From 3a821ffdd2f238620c92620b4aaed4ea189365d4 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta
Date: Wed, 30 Jun 2021 04:17:44 -0400
Subject: [PATCH 01/38] Revert "Merge branch 'amd-staging' into amd-master-next"

This reverts commit ecacf90269b3401b48f5dd46a653c819e05bebe5.

Reason for revert: Root cause for SWDEV-293424

Change-Id: Ib91024a75e26314f3a9af2cad421d0aaf5e47f08
---
 .gitattributes | 19 -
 CMakeLists.txt | 65 +-
 README.md | 6 +-
 amdocl/CL/cl.h | 1836 +++
 amdocl/CL/cl_egl.h | 132 +
 amdocl/CL/cl_ext.h | 1051 ++
 amdocl/CL/cl_gl.h | 171 +
 amdocl/CL/cl_gl_ext.h | 52 +
 amdocl/CL/cl_icd.h | 1269 ++
 amdocl/CL/cl_platform.h | 1384 ++
 amdocl/CL/cl_version.h | 86 +
 amdocl/CL/opencl.h | 47 +
 amdocl/EGL/egl.h | 329 +
 amdocl/EGL/eglext.h | 645 +
 amdocl/EGL/eglplatform.h | 125 +
 amdocl/KHR/khrplatform.h | 282 +
 amdocl/cl_common.hpp | 301 +
 amdocl/cl_debugger_amd.h | 694 +
 amdocl/cl_icd.cpp | 293 +
 amdocl/cl_icd_amd.h | 739 +
 amdocl/cl_kernel.h | 165 +
 amdocl/cl_profile_amd.h | 189 +
 amdocl/cl_thread_trace_amd.h | 363 +
 amdocl/gl_functions.hpp | 64 +
 amdocl/icd/loader/icd_dispatch.h | 108 +
 bin/hip_embed_pch.sh | 43 +-
 bin/hipcc | 21 +-
 bin/hipcc.bat | 2 -
 bin/hipconfig.bat | 2 -
 bin/roc-obj | 264 -
 cmake/FindROCR.cmake | 35 +
 cmake/FindROCT.cmake | 35 +
 docs/markdown/hip_faq.md | 14 +-
 docs/markdown/hip_kernel_language.md | 100 +-
 docs/markdown/hip_programming_guide.md | 21 +-
 docs/markdown/hip_terms.md | 58 +-
 docs/markdown/obj_tooling.md | 56 +-
 hip-config.cmake.in | 33 +-
 include/hip/amd_detail/channel_descriptor.h | 348 +
 include/hip/amd_detail/concepts.hpp | 30 +
 include/hip/amd_detail/cuda/cuda.h | 1 +
 include/hip/amd_detail/cuda/math_functions.h | 1 +
 include/hip/amd_detail/device_functions.h | 1347 ++
 include/hip/amd_detail/device_library_decls.h | 118 +
 include/hip/amd_detail/driver_types.h | 478 +
 .../hip/amd_detail/functional_grid_launch.hpp | 218 +
 include/hip/amd_detail/grid_launch.h | 67 +
 include/hip/amd_detail/grid_launch.hpp | 50 +
 .../hip/amd_detail/grid_launch_GGL.hpp | 11 +-
 include/hip/amd_detail/helpers.hpp | 137 +
 include/hip/amd_detail/hip_atomic.h | 691 +
 include/hip/amd_detail/hip_common.h | 32 +
 include/hip/amd_detail/hip_complex.h | 309 +
 .../hip/amd_detail/hip_cooperative_groups.h | 510 +
 .../hip_cooperative_groups_helper.h | 180 +
 include/hip/amd_detail/hip_fp16.h | 1662 +++
 include/hip/amd_detail/hip_fp16_gcc.h | 254 +
 include/hip/amd_detail/hip_fp16_math_fwd.h | 86 +
 include/hip/amd_detail/hip_ldg.h | 100 +
 include/hip/amd_detail/hip_memory.h | 114 +
 include/hip/amd_detail/hip_runtime.h | 417 +
 include/hip/amd_detail/hip_runtime_api.h | 4354 ++++++
 include/hip/amd_detail/hip_runtime_prof.h | 77 +
 include/hip/amd_detail/hip_surface_types.h | 54 +
 include/hip/amd_detail/hip_texture_types.h | 97 +
 include/hip/amd_detail/hip_vector_types.h | 1598 +++
 include/hip/amd_detail/hiprtc.h | 94 +
 include/hip/amd_detail/host_defines.h | 72 +
 include/hip/amd_detail/hsa_helpers.hpp | 102 +
 include/hip/amd_detail/library_types.h | 41 +
 include/hip/amd_detail/llvm_intrinsics.h | 41 +
 .../amd_detail/macro_based_grid_launch.hpp | 798 ++
 include/hip/amd_detail/math_functions.h | 1502 +++
 include/hip/amd_detail/math_fwd.h | 714 +
 include/hip/amd_detail/ockl_image.h | 135 +
 include/hip/amd_detail/program_state.hpp | 107 +
 include/hip/amd_detail/surface_functions.h | 59 +
 .../hip/amd_detail/texture_fetch_functions.h | 388 +
 include/hip/amd_detail/texture_functions.h | 11102 ++++++++++++++++
 .../amd_detail/texture_indirect_functions.h | 503 +
 include/hip/amd_detail/texture_types.h | 109 +
 include/hip/channel_descriptor.h | 4 +-
 include/hip/device_functions.h | 2 +-
 include/hip/driver_types.h | 2 +-
 include/hip/hcc_detail | 1 +
 include/hip/hip_complex.h | 4 +-
 include/hip/hip_cooperative_groups.h | 4 +-
 include/hip/hip_fp16.h | 2 +-
 include/hip/hip_runtime.h | 4 +-
 include/hip/hip_runtime_api.h | 3892 +----
 include/hip/hip_texture_types.h | 4 +-
 include/hip/hip_vector_types.h | 2 +-
 include/hip/hiprtc.h | 4 +-
 include/hip/library_types.h | 2 +-
 include/hip/math_functions.h | 2 +-
 .../hip/nvidia_detail/channel_descriptor.h | 28 +
 include/hip/nvidia_detail/hip_complex.h | 119 +
 .../nvidia_detail/hip_cooperative_groups.h | 12 +
 include/hip/nvidia_detail/hip_runtime.h | 122 +
 include/hip/nvidia_detail/hip_runtime_api.h | 2195 +++
 include/hip/nvidia_detail/hip_texture_types.h | 6 +
 include/hip/nvidia_detail/hiprtc.h | 168 +
 include/hip/texture_types.h | 2 +-
 packaging/hip-base.postinst | 3 +-
 packaging/hip-base.prerm | 1 -
 packaging/hip-base.txt | 12 +-
 packaging/hip-rocclr.txt | 24 +-
 rocclr/CMakeLists.txt | 303 +
 rocclr/amd_hsa_elf.hpp | 118 +
 rocclr/cl_gl.cpp | 2432 ++++
 rocclr/cl_gl_amd.hpp | 379 +
 rocclr/cl_lqdflash_amd.cpp | 312 +
 rocclr/cl_lqdflash_amd.h | 58 +
 rocclr/fixme.cpp | 32 +
 rocclr/hip_activity.cpp | 35 +
 rocclr/hip_code_object.cpp | 782 ++
 rocclr/hip_code_object.hpp | 156 +
 rocclr/hip_context.cpp | 380 +
 rocclr/hip_conversions.hpp | 903 ++
 rocclr/hip_device.cpp | 240 +
 rocclr/hip_device_runtime.cpp | 563 +
 rocclr/hip_error.cpp | 176 +
 rocclr/hip_event.cpp | 452 +
 rocclr/hip_event.hpp | 151 +
 rocclr/hip_fatbin.cpp | 158 +
 rocclr/hip_fatbin.hpp | 87 +
 rocclr/hip_formatting.hpp | 853 ++
 rocclr/hip_global.cpp | 196 +
 rocclr/hip_global.hpp | 119 +
 rocclr/hip_graph.cpp | 415 +
 rocclr/hip_graph_capture.hpp | 48 +
 rocclr/hip_graph_helper.hpp | 35 +
 rocclr/hip_graph_internal.cpp | 364 +
 rocclr/hip_graph_internal.hpp | 355 +
 rocclr/hip_hcc.def.in | 293 +
 rocclr/hip_hcc.map.in | 310 +
 rocclr/hip_hcc.rc | 75 +
 rocclr/hip_hmm.cpp | 220 +
 rocclr/hip_intercept.cpp | 81 +
 rocclr/hip_internal.hpp | 353 +
 rocclr/hip_memory.cpp | 2852 ++++
 rocclr/hip_module.cpp | 672 +
 rocclr/hip_peer.cpp | 250 +
 rocclr/hip_platform.cpp | 942 +
 rocclr/hip_platform.hpp | 97 +
 rocclr/hip_prof_api.h | 270 +
 rocclr/hip_prof_gen.py | 673 +
 rocclr/hip_profile.cpp | 40 +
 rocclr/hip_rtc.cpp | 419 +
 rocclr/hip_stream.cpp | 587 +
 rocclr/hip_stream_ops.cpp | 129 +
 rocclr/hip_surface.cpp | 37 +
 rocclr/hip_texture.cpp | 1303 ++
 rocclr/hiprtc_internal.hpp | 65 +
 rocclr/trace_helper.h | 246 +
 .../15_static_library/host_functions/Makefile | 2 +-
 tests/catch/CMakeLists.txt | 22 +-
 tests/catch/README.md | 7 +-
 tests/catch/hipTestMain/CMakeLists.txt | 34 +-
 tests/catch/hipTestMain/hip_test_context.cc | 66 +-
 tests/catch/hipTestMain/main.cc | 2 +-
 tests/catch/include/hip_test_checkers.hh | 164 -
 tests/catch/include/hip_test_common.hh | 14 -
 tests/catch/include/hip_test_context.hh | 5 +
 tests/catch/include/hip_test_kernels.hh | 62 -
 tests/catch/multiproc/CMakeLists.txt | 13 -
 tests/catch/multiproc/childMalloc.cc | 62 -
 tests/catch/multiproc/hipMallocConcurrency.cc | 188 -
 tests/catch/unit/CMakeLists.txt | 3 +-
 tests/catch/unit/deviceLib/CMakeLists.txt | 9 -
 tests/catch/unit/kernels/add.cc | 9 +-
 tests/catch/unit/rtc/CMakeLists.txt | 18 +-
 tests/catch/unit/rtc/saxpy.cc | 81 +-
 tests/catch/unit/rtc/test.cc | 6 +
 tests/performance/memory/hipPerfMemFill.cpp | 526 -
 .../hipDoublePrecisionMathDevice.cpp | 3 -
 tests/src/deviceLib/hipIntegerIntrinsics.cpp | 4 -
 tests/src/deviceLib/hipTestClock.cpp | 10 +-
 tests/src/deviceLib/hip_funnelshift.cpp | 252 -
 tests/src/g++/hipMalloc.cpp | 2 +-
 tests/src/printf/printf_common.h | 15 -
 .../hipLaunchCoopMultiKernel.cpp | 7 +-
 tests/src/runtimeApi/graph/hipGraph.cpp | 6 +-
 .../memory/hipMallocManaged_MultiScenario.cpp | 29 +-
 .../runtimeApi/module/hipManagedKeyword.cpp | 69 -
 .../module/hipModuleLaunchKernel.cpp | 23 +-
 tests/src/texture/hipTextureMipmapObj2D.cpp | 24 +-
 187 files changed, 62538 insertions(+), 6043 deletions(-)
 delete mode 100644 .gitattributes
 create mode 100644 amdocl/CL/cl.h
 create mode 100644 amdocl/CL/cl_egl.h
 create mode 100644 amdocl/CL/cl_ext.h
 create mode 100644 amdocl/CL/cl_gl.h
 create mode 100644 amdocl/CL/cl_gl_ext.h
 create mode 100644 amdocl/CL/cl_icd.h
 create mode 100644 amdocl/CL/cl_platform.h
 create mode 100644 amdocl/CL/cl_version.h
 create mode 100644 amdocl/CL/opencl.h
 create mode 100644 amdocl/EGL/egl.h
 create mode 100644 amdocl/EGL/eglext.h
 create mode 100644 amdocl/EGL/eglplatform.h
 create mode 100644 amdocl/KHR/khrplatform.h
 create mode 100644 amdocl/cl_common.hpp
 create mode 100644 amdocl/cl_debugger_amd.h
 create mode 100644 amdocl/cl_icd.cpp
 create mode 100644 amdocl/cl_icd_amd.h
 create mode 100644 amdocl/cl_kernel.h
 create mode 100644 amdocl/cl_profile_amd.h
 create mode 100644 amdocl/cl_thread_trace_amd.h
 create mode 100644 amdocl/gl_functions.hpp
 create mode 100644 amdocl/icd/loader/icd_dispatch.h
 delete mode 100644 bin/hipcc.bat
 delete mode 100644 bin/hipconfig.bat
 delete mode 100755 bin/roc-obj
 create mode 100644 cmake/FindROCR.cmake
 create mode 100644 cmake/FindROCT.cmake
 create mode 100644 include/hip/amd_detail/channel_descriptor.h
 create mode 100644 include/hip/amd_detail/concepts.hpp
 create mode 100644 include/hip/amd_detail/cuda/cuda.h
 create mode 100644 include/hip/amd_detail/cuda/math_functions.h
 create mode 100644 include/hip/amd_detail/device_functions.h
 create mode 100644 include/hip/amd_detail/device_library_decls.h
 create mode 100644 include/hip/amd_detail/driver_types.h
 create mode 100644 include/hip/amd_detail/functional_grid_launch.hpp
 create mode 100644 include/hip/amd_detail/grid_launch.h
 create mode 100644 include/hip/amd_detail/grid_launch.hpp
 rename tests/src/runtimeApi/module/managed_kernel.cpp => include/hip/amd_detail/grid_launch_GGL.hpp (84%)
 create mode 100644 include/hip/amd_detail/helpers.hpp
 create mode 100644 include/hip/amd_detail/hip_atomic.h
 create mode 100644 include/hip/amd_detail/hip_common.h
 create mode 100644 include/hip/amd_detail/hip_complex.h
 create mode 100644 include/hip/amd_detail/hip_cooperative_groups.h
 create mode 100644 include/hip/amd_detail/hip_cooperative_groups_helper.h
 create mode 100644 include/hip/amd_detail/hip_fp16.h
 create mode 100644 include/hip/amd_detail/hip_fp16_gcc.h
 create mode 100644 include/hip/amd_detail/hip_fp16_math_fwd.h
 create mode 100644 include/hip/amd_detail/hip_ldg.h
 create mode 100644 include/hip/amd_detail/hip_memory.h
 create mode 100644 include/hip/amd_detail/hip_runtime.h
 create mode 100644 include/hip/amd_detail/hip_runtime_api.h
 create mode 100644 include/hip/amd_detail/hip_runtime_prof.h
 create mode 100644 include/hip/amd_detail/hip_surface_types.h
 create mode 100644 include/hip/amd_detail/hip_texture_types.h
 create mode 100644 include/hip/amd_detail/hip_vector_types.h
 create mode 100644 include/hip/amd_detail/hiprtc.h
 create mode 100644 include/hip/amd_detail/host_defines.h
 create mode 100644 include/hip/amd_detail/hsa_helpers.hpp
 create mode 100644 include/hip/amd_detail/library_types.h
 create mode 100644 include/hip/amd_detail/llvm_intrinsics.h
 create mode 100644 include/hip/amd_detail/macro_based_grid_launch.hpp
 create mode 100644 include/hip/amd_detail/math_functions.h
 create mode 100644 include/hip/amd_detail/math_fwd.h
 create mode 100644 include/hip/amd_detail/ockl_image.h
 create mode 100644 include/hip/amd_detail/program_state.hpp
 create mode 100644 include/hip/amd_detail/surface_functions.h
 create mode 100644 include/hip/amd_detail/texture_fetch_functions.h
 create mode 100644 include/hip/amd_detail/texture_functions.h
 create mode 100644 include/hip/amd_detail/texture_indirect_functions.h
 create mode 100644 include/hip/amd_detail/texture_types.h
 create mode 120000 include/hip/hcc_detail
 create mode 100644 include/hip/nvidia_detail/channel_descriptor.h
 create mode 100644 include/hip/nvidia_detail/hip_complex.h
 create mode 100644 include/hip/nvidia_detail/hip_cooperative_groups.h
 create mode 100644 include/hip/nvidia_detail/hip_runtime.h
 create mode 100644 include/hip/nvidia_detail/hip_runtime_api.h
 create mode 100644 include/hip/nvidia_detail/hip_texture_types.h
 create mode 100644 include/hip/nvidia_detail/hiprtc.h
 create mode 100755 rocclr/CMakeLists.txt
 create mode 100644 rocclr/amd_hsa_elf.hpp
 create mode 100644 rocclr/cl_gl.cpp
 create mode 100644 rocclr/cl_gl_amd.hpp
 create mode 100644 rocclr/cl_lqdflash_amd.cpp
 create mode 100644 rocclr/cl_lqdflash_amd.h
 create mode 100644 rocclr/fixme.cpp
 create mode 100644 rocclr/hip_activity.cpp
 create mode 100755 rocclr/hip_code_object.cpp
 create mode 100755 rocclr/hip_code_object.hpp
 create mode 100755 rocclr/hip_context.cpp
 create mode 100644 rocclr/hip_conversions.hpp
 create mode 100644 rocclr/hip_device.cpp
 create mode 100755 rocclr/hip_device_runtime.cpp
 create mode 100644 rocclr/hip_error.cpp
 create mode 100755 rocclr/hip_event.cpp
 create mode 100644 rocclr/hip_event.hpp
 create mode 100755 rocclr/hip_fatbin.cpp
 create mode 100755 rocclr/hip_fatbin.hpp
 create mode 100644 rocclr/hip_formatting.hpp
 create mode 100755 rocclr/hip_global.cpp
 create mode 100755 rocclr/hip_global.hpp
 create mode 100644 rocclr/hip_graph.cpp
 create mode 100644 rocclr/hip_graph_capture.hpp
 create mode 100644 rocclr/hip_graph_helper.hpp
 create mode 100644 rocclr/hip_graph_internal.cpp
 create mode 100644 rocclr/hip_graph_internal.hpp
 create mode 100755 rocclr/hip_hcc.def.in
 create mode 100755 rocclr/hip_hcc.map.in
 create mode 100644 rocclr/hip_hcc.rc
 create mode 100644 rocclr/hip_hmm.cpp
 create mode 100755 rocclr/hip_intercept.cpp
 create mode 100755 rocclr/hip_internal.hpp
 create mode 100755 rocclr/hip_memory.cpp
 create mode 100755 rocclr/hip_module.cpp
 create mode 100755 rocclr/hip_peer.cpp
 create mode 100755 rocclr/hip_platform.cpp
 create mode 100755 rocclr/hip_platform.hpp
 create mode 100644 rocclr/hip_prof_api.h
 create mode 100755 rocclr/hip_prof_gen.py
 create mode 100644 rocclr/hip_profile.cpp
 create mode 100755 rocclr/hip_rtc.cpp
 create mode 100755 rocclr/hip_stream.cpp
 create mode 100644 rocclr/hip_stream_ops.cpp
 create mode 100644 rocclr/hip_surface.cpp
 create mode 100755 rocclr/hip_texture.cpp
 create mode 100644 rocclr/hiprtc_internal.hpp
 create mode 100644 rocclr/trace_helper.h
 delete mode 100644 tests/catch/include/hip_test_checkers.hh
 delete mode 100644 tests/catch/include/hip_test_kernels.hh
 delete mode 100644 tests/catch/multiproc/CMakeLists.txt
 delete mode 100644 tests/catch/multiproc/childMalloc.cc
 delete mode 100644 tests/catch/multiproc/hipMallocConcurrency.cc
 create mode 100644 tests/catch/unit/rtc/test.cc
 delete mode 100644 tests/performance/memory/hipPerfMemFill.cpp
 delete mode 100644 tests/src/deviceLib/hip_funnelshift.cpp
 delete mode 100644 tests/src/runtimeApi/module/hipManagedKeyword.cpp

diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index b84b57d149..0000000000
--- a/.gitattributes
+++ /dev/null
@@ -1,19 +0,0 @@
-# Set the default behavior, in case people don't have core.autolf set.
-* text=auto
-
-# Explicitly declare text files you want to always be normalized and converted
-# to have LF line endings on checkout.
-*.c text eol=lf
-*.cpp text eol=lf
-*.cc text eol=lf
-*.h text eol=lf
-*.hpp text eol=lf
-*.txt text eol=lf
-
-# auto remove white space
-*.cpp filter=trimspace
-*.c filter=trimspace
-*.h filter=trimspacecpp
-*.hpp filter=trimspace
-*.md filter=trimspace
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4cd932fbb6..cb8e7f6dd8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,8 @@ project(hip)
 # cmake -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Debug -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX= ..
 # If you don't specify CMAKE_INSTALL_PREFIX, hip-rocclr runtime will be installed to "/opt/rocm/hip".
+set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or static lib (.a) ")
+
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
 #############################
@@ -48,12 +50,6 @@ else()
   set(_pchStatus 0)
 endif()
 
-# With HIP project split plan, for interim puropose it is REQUIRED to pass
-# HIP_AMD_BACKEND_SOURCE_DIR= as cmake variable
-if(NOT DEFINED HIP_AMD_BACKEND_SOURCE_DIR)
-  message(FATAL_ERROR "HIP_AMD_BACKEND_SOURCE_DIR not defined ")
-endif()
-
 #############################
 # Setup config generation
 #############################
@@ -67,8 +63,6 @@ endmacro()
 #############################
 # Setup version information
 #############################
-# hipconfig is a perl script and is not trivially invokable on Windows.
-if(NOT WIN32)
 # Determine HIP_BASE_VERSION
 set(ENV{HIP_PATH} "")
 execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/hipconfig --version
@@ -78,13 +72,11 @@
 string(REPLACE "." ";" VERSION_LIST ${HIP_BASE_VERSION})
 list(GET VERSION_LIST 0 HIP_VERSION_MAJOR)
 list(GET VERSION_LIST 1 HIP_VERSION_MINOR)
 set(HIP_VERSION_GITDATE 0)
-endif()
 
 find_package(Git)
 
 # FIXME: Two different version strings used.
-# Below we use UNIX commands, not compatible with Windows.
-if(GIT_FOUND AND (NOT WIN32))
+if(GIT_FOUND)
   # get date information based on UTC
   # use the last two digits of year + week number + day in the week as HIP_VERSION_GITDATE
   # use the commit date, instead of build date
@@ -218,6 +210,25 @@ message(STATUS "HIP Compiler: " ${HIP_COMPILER})
 add_to_config(_buildInfo HIP_RUNTIME)
 add_to_config(_buildInfo HIP_COMPILER)
 
+############ If HIP_PLATFORM is amd, HSA_PATH has to be defined ##################
+
+if(HIP_PLATFORM STREQUAL "amd")
+  # Determine HSA_PATH
+  if(NOT DEFINED HSA_PATH)
+    if(NOT DEFINED ENV{HSA_PATH})
+      set(HSA_PATH "/opt/rocm/hsa" CACHE PATH "Path to which HSA runtime has been installed")
+    else()
+      set(HSA_PATH $ENV{HSA_PATH} CACHE PATH "Path to which HSA runtime has been installed")
+    endif()
+  endif()
+  if(IS_ABSOLUTE ${HSA_PATH} AND EXISTS ${HSA_PATH} AND IS_DIRECTORY ${HSA_PATH})
+    message(STATUS "Looking for HSA runtime in: " ${HSA_PATH})
+  else()
+    message(FATAL_ERROR "Don't know where to find HSA runtime. Please specify absolute path using -DHSA_PATH")
+  endif()
+endif()
+message(STATUS "HSA runtime in: " ${HSA_PATH})
+
 # Set default build type
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Release")
@@ -269,13 +280,6 @@ if (BUILD_HIPIFY_CLANG)
   add_subdirectory(hipify-clang)
 endif()
 
-# Workaround for current versioning logic not being compatible with Windows
-if(WIN32)
-  set(HIP_VERSION_MAJOR 0)
-  set(HIP_VERSION_MINOR 0)
-  set(HIP_VERSION_GITDATE 0)
-endif()
-
 # Generate hip_version.h
 set(_versionInfoHeader
 "// Auto-generated by cmake
@@ -291,8 +295,11 @@ set(_versionInfoHeader
 file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader})
 
 if(HIP_RUNTIME STREQUAL "rocclr")
-  set(HIP_COMMON_DIR ${PROJECT_SOURCE_DIR})
-  add_subdirectory(${HIP_AMD_BACKEND_SOURCE_DIR} src/hipamd)
+  add_subdirectory(rocclr)
+
+  set(HIP_ROCclr_BUILD_FLAGS "${HIP_ROCclr_BUILD_FLAGS} -fPIC ${ROCclr_CXX_FLAGS} -I${HSA_PATH}/include")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_ROCclr_BUILD_FLAGS}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_ROCclr_BUILD_FLAGS}")
 endif()
 
 # Generate .hipInfo
@@ -322,20 +329,16 @@ install(FILES ${PROJECT_BINARY_DIR}/.hipVersion DESTINATION bin)
 execute_process(COMMAND test ${CMAKE_INSTALL_PREFIX} -ef ${CMAKE_CURRENT_SOURCE_DIR}
                 RESULT_VARIABLE INSTALL_SOURCE)
 if(NOT ${INSTALL_SOURCE} EQUAL 0)
-  # Exclude .bat files on Linux.
-  if(WIN32)
-    install(DIRECTORY bin DESTINATION . USE_SOURCE_PERMISSIONS)
-  else()
-    install(DIRECTORY bin DESTINATION . USE_SOURCE_PERMISSIONS
-            PATTERN *.bat EXCLUDE)
+  if(HIP_RUNTIME STREQUAL "rocclr")
+    install(DIRECTORY rocclr DESTINATION .)
   endif()
+  install(DIRECTORY bin DESTINATION . USE_SOURCE_PERMISSIONS)
 
   # The following two lines will be removed after upstream updation
   install(CODE "MESSAGE(\"Removing ${CMAKE_INSTALL_PREFIX}/include\")")
   install(CODE "file(REMOVE_RECURSE ${CMAKE_INSTALL_PREFIX}/include)")
 
   install(DIRECTORY include DESTINATION .)
-  install(DIRECTORY ${HIP_AMD_BACKEND_SOURCE_DIR}/include/hip/ DESTINATION include/hip/)
   install(DIRECTORY cmake DESTINATION .)
 endif()
 
@@ -351,8 +354,6 @@ install(FILES ${PROJECT_BINARY_DIR}/include/hip/hip_version.h
 #############################
 # hip-config
 #############################
-# Packaging invokes UNIX commands, which are not available on Windows.
-if(NOT WIN32)
 include(CMakePackageConfigHelpers)
 
 configure_package_config_file(
@@ -459,7 +460,6 @@ endif()
 if(POLICY CMP0037)
   cmake_policy(POP)
 endif()
-endif()
 
 #############################
 # Code analysis
@@ -485,20 +485,16 @@ endif()
 #############################
 # Testing steps
 #############################
-# HIT is not compatible with Windows
-if(NOT WIN32)
 set(HIP_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR})
 set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR})
 if(HIP_PLATFORM STREQUAL "nvidia")
 execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/include" "${HIP_ROOT_DIR}/include" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
 endif()
-execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_AMD_BACKEND_SOURCE_DIR}/include/hip/" "${HIP_ROOT_DIR}/include/hip/" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
 execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
 if(${RUN_HIT} EQUAL 0)
 execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
 endif()
 if(HIP_CATCH_TEST EQUAL "1")
-  enable_testing()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tests/catch)
 else()
 if(${RUN_HIT} EQUAL 0)
@@ -509,7 +505,6 @@ else()
   message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location")
 endif()
 endif()
-endif()
 
 #############################
 # Code analysis
diff --git a/README.md b/README.md
index ff292dee17..4644e9a716 100644
--- a/README.md
+++ b/README.md
@@ -29,20 +29,20 @@ The HIP repository maintains several branches. The branches that are of importan
 
 HIP releases are typically naming convention for each ROCM release to help differentiate them.
 
-* rocm x.yy: These are the stable releases based on the ROCM release.
+* rocm x.yy: These are the stable releases based on the ROCM release. This type of release is typically made once a month.*
 
 ## More Info:
 - [Installation](INSTALL.md)
 - [HIP FAQ](docs/markdown/hip_faq.md)
 - [HIP Kernel Language](docs/markdown/hip_kernel_language.md)
-- [HIP Runtime API (Doxygen)](https://github.com/RadeonOpenCompute/ROCm)
+- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP)
 - [HIP Porting Guide](docs/markdown/hip_porting_guide.md)
 - [HIP Porting Driver Guide](docs/markdown/hip_porting_driver_api.md)
 - [HIP Programming Guide](docs/markdown/hip_programming_guide.md)
 - [HIP Logging ](docs/markdown/hip_logging.md)
 - [HIP Debugging ](docs/markdown/hip_debugging.md)
-- [Code Object tooling ](docs/markdown/obj_tooling.md)
+- [Code Object tooling ] (docs/markdown/obj_tooling.md)
 - [HIP Terminology](docs/markdown/hip_terms2.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/OpenCL)
 - [HIPIFY](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/README.md)
 - Supported CUDA APIs:
diff --git a/amdocl/CL/cl.h b/amdocl/CL/cl.h
new file mode 100644
index 0000000000..cea6dc2405
--- /dev/null
+++ b/amdocl/CL/cl.h
@@ -0,0 +1,1836 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2019 The Khronos Group Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_device_svm_capabilities; +#endif +typedef cl_bitfield cl_command_queue_properties; +#ifdef CL_VERSION_1_2 +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; +#endif + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_queue_properties; +#endif +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_svm_mem_flags; +#endif +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +#ifdef CL_VERSION_1_2 +typedef cl_bitfield cl_mem_migration_flags; +#endif +typedef cl_uint cl_image_info; +#ifdef CL_VERSION_1_1 +typedef cl_uint cl_buffer_create_type; +#endif +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +#ifdef CL_VERSION_2_0 +typedef intptr_t cl_pipe_properties; +typedef cl_uint cl_pipe_info; +#endif +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +#ifdef CL_VERSION_1_2 +typedef cl_uint cl_program_binary_type; +#endif +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +#ifdef CL_VERSION_1_2 +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +#endif +typedef cl_uint cl_kernel_work_group_info; +#ifdef CL_VERSION_2_1 +typedef cl_uint cl_kernel_sub_group_info; +#endif +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_sampler_properties; +typedef cl_uint cl_kernel_exec_info; +#endif +#ifdef CL_EXPERIMENTAL +typedef cl_bitfield cl_device_atomic_capabilities; +typedef cl_uint cl_khronos_vendor_id; +#endif + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +#ifdef CL_VERSION_1_2 + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; +#ifdef CL_VERSION_2_0 +#ifdef __GNUC__ + __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ +#endif +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable : 4201 ) /* Prevents warning about nameless struct/union in /W4 /Za builds */ +#endif + union { +#endif + cl_mem buffer; +#ifdef CL_VERSION_2_0 + cl_mem mem_object; + }; +#ifdef _MSC_VER +#pragma warning( pop ) +#endif +#endif +} cl_image_desc; + +#endif + +#ifdef CL_VERSION_1_1 + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + +#endif + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#ifdef CL_VERSION_1_1 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#endif +#ifdef CL_VERSION_1_2 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 +#endif + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#ifdef CL_VERSION_1_1 +#define CL_INVALID_PROPERTY -64 +#endif +#ifdef CL_VERSION_1_2 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 +#endif +#ifdef CL_VERSION_2_0 +#define CL_INVALID_PIPE_SIZE -69 +#define CL_INVALID_DEVICE_QUEUE -70 +#endif +#ifdef CL_VERSION_2_2 +#define CL_INVALID_SPEC_ID -71 +#define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72 +#endif + + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#ifdef CL_VERSION_1_2 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE +#endif + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 +#ifdef CL_VERSION_2_1 +#define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905 +#endif + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#endif +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define 
CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ +#ifdef CL_VERSION_2_0 +#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A +#endif +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#endif +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */ +#ifdef CL_VERSION_1_1 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#endif +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#endif +#ifdef CL_VERSION_2_0 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B +#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C +#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D +#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E +#define 
CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F +#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 +#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 +#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 +#define CL_DEVICE_SVM_CAPABILITIES 0x1053 +#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 +#define CL_DEVICE_MAX_PIPE_ARGS 0x1055 +#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 +#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 +#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 +#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 +#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A +#endif +#ifdef CL_VERSION_2_1 +#define CL_DEVICE_IL_VERSION 0x105B +#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C +#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D +#endif + +/* cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#ifdef CL_VERSION_1_1 +#define CL_FP_SOFT_FLOAT (1 << 6) +#endif +#ifdef CL_VERSION_1_2 +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) +#endif + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) +#ifdef CL_VERSION_2_0 +#define CL_QUEUE_ON_DEVICE (1 << 2) +#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) +#endif + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#ifdef CL_VERSION_1_1 +#define CL_CONTEXT_NUM_DEVICES 0x1083 +#endif + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#ifdef CL_VERSION_1_2 +#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_device_partition_property */ +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_device_affinity_domain */ +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) + +#endif + +#ifdef CL_VERSION_2_0 + +/* cl_device_svm_capabilities */ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS (1 << 3) + +#endif + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 +#ifdef CL_VERSION_2_0 +#define CL_QUEUE_SIZE 0x1094 +#endif +#ifdef CL_VERSION_2_1 +#define CL_QUEUE_DEVICE_DEFAULT 0x1095 +#endif + +/* cl_mem_flags and cl_svm_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define 
CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) +/* reserved (1 << 6) */ +#ifdef CL_VERSION_1_2 +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ +#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ +#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_mem_migration_flags - bitfield */ +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +#endif + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#ifdef CL_VERSION_1_1 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#endif +#ifdef CL_VERSION_1_2 +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE +#endif +#ifdef CL_VERSION_2_0 +#define CL_sRGB 0x10BF +#define CL_sRGBx 0x10C0 +#define CL_sRGBA 0x10C1 +#define CL_sBGRA 0x10C2 +#define CL_ABGR 0x10C3 +#endif + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#ifdef CL_VERSION_1_2 +#define CL_UNORM_INT24 0x10DF +#endif +#ifdef CL_VERSION_2_1 +#define CL_UNORM_INT_101010_2 0x10E0 +#endif + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#ifdef CL_VERSION_1_2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_OBJECT_PIPE 0x10F7 +#endif + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#ifdef CL_VERSION_1_1 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_USES_SVM_POINTER 0x1109 +#endif + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 +#ifdef CL_VERSION_1_2 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A +#endif + +#ifdef CL_VERSION_2_0 + +/* cl_pipe_info */ +#define CL_PIPE_PACKET_SIZE 0x1120 +#define CL_PIPE_MAX_PACKETS 0x1121 + +#endif + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#ifdef CL_VERSION_1_1 +#define 
CL_ADDRESS_MIRRORED_REPEAT 0x1134 +#endif + +/* cl_filter_mode */ +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 +#ifdef CL_VERSION_2_0 +/* These enumerants are for the cl_khr_mipmap_image extension. + They have since been added to cl_ext.h with an appropriate + KHR suffix, but are left here for backwards compatibility. */ +#define CL_SAMPLER_MIP_FILTER_MODE 0x1155 +#define CL_SAMPLER_LOD_MIN 0x1156 +#define CL_SAMPLER_LOD_MAX 0x1157 +#endif + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#ifdef CL_VERSION_1_2 +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) +#endif + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#ifdef CL_VERSION_1_2 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 +#endif +#ifdef CL_VERSION_2_1 +#define CL_PROGRAM_IL 0x1169 +#endif +#ifdef CL_VERSION_2_2 +#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A +#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B +#endif + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#ifdef CL_VERSION_1_2 +#define CL_PROGRAM_BINARY_TYPE 0x1184 +#endif +#ifdef CL_VERSION_2_0 +#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_program_binary_type */ +#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +#endif + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#ifdef CL_VERSION_1_2 +#define CL_KERNEL_ATTRIBUTES 0x1195 +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_info */ +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_address_qualifier */ +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_access_qualifier */ +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_type_qualifier */ +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) +#ifdef CL_VERSION_2_0 +#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3) +#endif + 
+#endif + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#ifdef CL_VERSION_1_2 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 +#endif + +#ifdef CL_VERSION_2_1 + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034 +#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8 +#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9 +#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA + +#endif + +#ifdef CL_VERSION_2_0 + +/* cl_kernel_exec_info */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7 + +#endif + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#ifdef CL_VERSION_1_1 +#define CL_EVENT_CONTEXT 0x11D4 +#endif + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#ifdef CL_VERSION_1_1 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#endif +#ifdef CL_VERSION_1_2 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 +#endif +#ifdef CL_VERSION_2_0 +#define CL_COMMAND_SVM_FREE 0x1209 +#define CL_COMMAND_SVM_MEMCPY 0x120A +#define CL_COMMAND_SVM_MEMFILL 0x120B +#define CL_COMMAND_SVM_MAP 0x120C +#define CL_COMMAND_SVM_UNMAP 0x120D +#endif + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +#ifdef CL_VERSION_1_1 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +#endif + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 +#ifdef CL_VERSION_2_0 +#define CL_PROFILING_COMMAND_COMPLETE 0x1284 +#endif + +#ifdef CL_EXPERIMENTAL + +/* cl_device_atomic_capabilities - bitfield */ +#define CL_DEVICE_ATOMIC_ORDER_RELAXED (1 << 0) +#define CL_DEVICE_ATOMIC_ORDER_ACQ_REL (1 << 1) +#define CL_DEVICE_ATOMIC_ORDER_SEQ_CST (1 << 2) +#define CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM (1 << 3) +#define CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP (1 << 4) +#define CL_DEVICE_ATOMIC_SCOPE_DEVICE (1 << 5) +#define CL_DEVICE_ATOMIC_SCOPE_ALL_SVM_DEVICES (1 << 6) + +/* cl_device_info */ +#define CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES 0x1063 +#define CL_DEVICE_ATOMIC_FENCE_CAPABILITIES 0x1064 +#define CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT 
0x1065 +#define CL_DEVICE_OPENCL_C_VERSIONS 0x1066 +#define CL_DEVICE_MAX_WRITE_IMAGE3D_ARGS 0x1067 +#define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068 +#define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT 0x1069 +/* 0x106A to 0x106E - Reserved for upcoming KHR extension */ +#define CL_DEVICE_OPENCL_C_FEATURES 0x106F + +/* cl_command_type */ +#define CL_COMMAND_SVM_MIGRATE_MEM 0x120E + +#endif + +/* cl_khronos_vendor_id */ +#define CL_KHRONOS_VENDOR_ID_CODEPLAY 0x10004 + +/********************************************************************************************************/ + +/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint num_entries, + cl_platform_id * platforms, + cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id platform, + cl_platform_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id platform, + cl_device_type device_type, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id device, + cl_device_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevices(cl_device_id in_device, + const cl_device_partition_property * properties, + cl_uint num_devices, + cl_device_id * out_devices, + cl_uint * num_devices_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetDefaultDeviceCommandQueue(cl_context context, + cl_device_id device, + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceAndHostTimer(cl_device_id device, + cl_ulong* device_timestamp, + cl_ulong* host_timestamp) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetHostTimer(cl_device_id device, + cl_ulong * host_timestamp) CL_API_SUFFIX__VERSION_2_1; + +#endif + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * properties, + cl_uint num_devices, + const cl_device_id * devices, + void (CL_CALLBACK * pfn_notify)(const char * errinfo, + const void * private_info, + size_t cb, + void * user_data), + void * user_data, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * properties, + cl_device_type device_type, + void (CL_CALLBACK * pfn_notify)(const char * errinfo, + const void * private_info, + size_t cb, + void * user_data), + void * user_data, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context context, + cl_context_info param_name, + size_t param_value_size, + void * 
param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithProperties(cl_context context, + cl_device_id device, + const cl_queue_properties * properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context context, + cl_mem_flags flags, + size_t size, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem buffer, + cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void * buffer_create_info, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage(cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + const cl_image_desc * image_desc, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreatePipe(cl_context context, + cl_mem_flags flags, + cl_uint pipe_packet_size, + cl_uint pipe_max_packets, + const cl_pipe_properties * properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_image_format * image_formats, + cl_uint * num_image_formats) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem memobj, + cl_mem_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem image, + cl_image_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPipeInfo(cl_mem pipe, + cl_pipe_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0; + +#endif + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback(cl_mem memobj, + void (CL_CALLBACK * pfn_notify)(cl_mem memobj, + void * user_data), + void * user_data) CL_API_SUFFIX__VERSION_1_1; + +#endif + +/* SVM Allocation APIs */ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY void * CL_API_CALL +clSVMAlloc(cl_context context, + cl_svm_mem_flags flags, + size_t size, + cl_uint alignment) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFree(cl_context 
context, + void * svm_pointer) CL_API_SUFFIX__VERSION_2_0; + +#endif + +/* Sampler APIs */ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSamplerWithProperties(cl_context context, + const cl_sampler_properties * sampler_properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler sampler, + cl_sampler_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context context, + cl_uint count, + const char ** strings, + const size_t * lengths, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context context, + cl_uint num_devices, + const cl_device_id * device_list, + const size_t * lengths, + const unsigned char ** binaries, + cl_int * binary_status, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBuiltInKernels(cl_context context, + cl_uint num_devices, + const cl_device_id * device_list, + const char * kernel_names, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithIL(cl_context context, + const void* il, + size_t length, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program program, + cl_uint num_devices, + const cl_device_id * device_list, + const char * options, + void (CL_CALLBACK * pfn_notify)(cl_program program, + void * user_data), + void * user_data) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clCompileProgram(cl_program program, + cl_uint num_devices, + const cl_device_id * device_list, + const char * options, + cl_uint num_input_headers, + const cl_program * input_headers, + const char ** header_include_names, + void (CL_CALLBACK * pfn_notify)(cl_program program, + void * user_data), + void * user_data) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clLinkProgram(cl_context context, + cl_uint num_devices, + const cl_device_id * device_list, + const char * options, + cl_uint num_input_programs, + const cl_program * input_programs, + void (CL_CALLBACK * pfn_notify)(cl_program program, + void * user_data), + void * user_data, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramReleaseCallback(cl_program program, + void (CL_CALLBACK * pfn_notify)(cl_program program, + void * user_data), + void * user_data) CL_API_SUFFIX__VERSION_2_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramSpecializationConstant(cl_program program, + cl_uint spec_id, + size_t spec_size, + const void* spec_value) CL_API_SUFFIX__VERSION_2_2; + +#endif + +#ifdef CL_VERSION_1_2 + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program program, + cl_program_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program program, + const char * kernel_name, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program program, + cl_uint num_kernels, + cl_kernel * kernels, + cl_uint * num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCloneKernel(cl_kernel source_kernel, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel kernel, + cl_uint arg_index, + size_t arg_size, + const void * arg_value) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointer(cl_kernel kernel, + cl_uint arg_index, + const void * arg_value) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfo(cl_kernel kernel, + cl_kernel_exec_info param_name, + size_t param_value_size, + const void * param_value) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel kernel, + cl_kernel_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelArgInfo(cl_kernel kernel, + cl_uint arg_indx, + cl_kernel_arg_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_sub_group_info param_name, + size_t input_value_size, + const void* input_value, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_1; + +#endif + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint num_events, + const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event event, + cl_event_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context context, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; + 
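/* A usage sketch (editorial illustration, not part of the Khronos header): all of
 * the clGet*Info entry points declared in this header share the same two-call
 * idiom -- first query the required size, then fetch the value. Assuming a valid
 * cl_device_id named dev, plus <stdio.h> and <stdlib.h>:
 *
 *     size_t n = 0;
 *     clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, NULL, &n);    // first call: required size
 *     char *name = (char *)malloc(n);
 *     clGetDeviceInfo(dev, CL_DEVICE_NAME, n, name, NULL);  // second call: the value
 *     printf("device: %s\n", name);
 *     free(name);
 */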
+#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event event, + cl_int execution_status) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback(cl_event event, + cl_int command_exec_callback_type, + void (CL_CALLBACK * pfn_notify)(cl_event event, + cl_int event_command_status, + void * user_data), + void * user_data) CL_API_SUFFIX__VERSION_1_1; + +#endif + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t size, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + const size_t * buffer_offset, + const size_t * host_offset, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t size, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + const size_t * buffer_offset, + const size_t * host_offset, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_1; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillBuffer(cl_command_queue command_queue, + cl_mem buffer, + const void * pattern, + size_t pattern_size, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + size_t src_offset, + size_t dst_offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL 
+clEnqueueCopyBufferRect(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + const size_t * src_origin, + const size_t * dst_origin, + const size_t * region, + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_read, + const size_t * origin, + const size_t * region, + size_t row_pitch, + size_t slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_write, + const size_t * origin, + const size_t * region, + size_t input_row_pitch, + size_t input_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillImage(cl_command_queue command_queue, + cl_mem image, + const void * fill_color, + const size_t * origin, + const size_t * region, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_image, + const size_t * src_origin, + const size_t * dst_origin, + const size_t * region, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_buffer, + const size_t * src_origin, + const size_t * region, + size_t dst_offset, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_image, + size_t src_offset, + const size_t * dst_origin, + const size_t * region, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_map, + cl_map_flags map_flags, + const size_t * origin, + const size_t * region, + size_t * image_row_pitch, + size_t * image_slice_pitch, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue command_queue, + cl_mem memobj, + void * mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY 
cl_int CL_API_CALL +clEnqueueMigrateMemObjects(cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem * mem_objects, + cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t * global_work_offset, + const size_t * global_work_size, + const size_t * local_work_size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue command_queue, + void (CL_CALLBACK * user_func)(void *), + void * args, + size_t cb_args, + cl_uint num_mem_objects, + const cl_mem * mem_list, + const void ** args_mem_loc, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarkerWithWaitList(cl_command_queue command_queue, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrierWithWaitList(cl_command_queue command_queue, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFree(cl_command_queue command_queue, + cl_uint num_svm_pointers, + void * svm_pointers[], + void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, + cl_uint num_svm_pointers, + void * svm_pointers[], + void * user_data), + void * user_data, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpy(cl_command_queue command_queue, + cl_bool blocking_copy, + void * dst_ptr, + const void * src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFill(cl_command_queue command_queue, + void * svm_ptr, + const void * pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMap(cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void * svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmap(cl_command_queue command_queue, + void * svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_0; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMigrateMem(cl_command_queue command_queue, + cl_uint num_svm_pointers, + const void ** svm_pointers, + const size_t * sizes, + cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_1; + +#endif + +#ifdef CL_VERSION_1_2 + +/* Extension function access + * + * Returns the extension function address for the 
given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL +clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, + const char * func_name) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + /* + * WARNING: + * This API introduces mutable state into the OpenCL implementation. It has been REMOVED + * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the + * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably. + * It is likely to be non-performant. Use of this API is not advised. Use at your own risk. + * + * Software developers previously relying on this API are instructed to set the command queue + * properties when creating the queue, instead. + */ + extern CL_API_ENTRY cl_int CL_API_CALL + clSetCommandQueueProperty(cl_command_queue command_queue, + cl_command_queue_properties properties, + cl_bool enable, + cl_command_queue_properties * old_properties) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; +#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */ + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage2D(cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + size_t image_width, + size_t image_height, + size_t image_row_pitch, + void * host_ptr, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage3D(cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + size_t image_width, + size_t image_height, + size_t image_depth, + size_t image_row_pitch, + size_t image_slice_pitch, + void * host_ptr, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue command_queue, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue command_queue, + cl_uint num_events, + const cl_event * event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL +clGetExtensionFunctionAddress(const char * func_name) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* Deprecated OpenCL 2.0 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context context, + cl_device_id device, + cl_command_queue_properties properties, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL +clCreateSampler(cl_context context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int 
CL_API_CALL +clEnqueueTask(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */ diff --git a/amdocl/CL/cl_egl.h b/amdocl/CL/cl_egl.h new file mode 100644 index 0000000000..bc4d998eb3 --- /dev/null +++ b/amdocl/CL/cl_egl.h @@ -0,0 +1,132 @@ +/******************************************************************************* + * Copyright (c) 2008-2019 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +#ifndef __OPENCL_CL_EGL_H +#define __OPENCL_CL_EGL_H + +#include <CL/cl.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ +#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F +#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D +#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E + +/* Error type for clCreateFromEGLImageKHR */ +#define CL_INVALID_EGL_OBJECT_KHR -1093 +#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 + +/* CLeglImageKHR is an opaque handle to an EGLImage */ +typedef void* CLeglImageKHR; + +/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ +typedef void* CLeglDisplayKHR; + +/* CLeglSyncKHR is an opaque handle to an EGLSync object */ +typedef void* CLeglSyncKHR; + +/* properties passed to clCreateFromEGLImageKHR */ +typedef intptr_t cl_egl_image_properties_khr; + + +#define cl_khr_egl_image 1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromEGLImageKHR(cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr * properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)( + cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr * properties, + cl_int * errcode_ret); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +#define cl_khr_egl_event 1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromEGLSyncKHR(cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( + cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int * errcode_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_EGL_H */ diff --git a/amdocl/CL/cl_ext.h b/amdocl/CL/cl_ext.h new file mode 100644 index 0000000000..4d6d8c093a --- /dev/null +++ b/amdocl/CL/cl_ext.h @@ -0,0 +1,1051 @@ +/******************************************************************************* + * Copyright (c) 2008-2019 The Khronos Group Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* cl_ext.h contains OpenCL extensions which don't have external */ +/* (OpenGL, D3D) dependencies. */ + +#ifndef __CL_EXT_H +#define __CL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <CL/cl.h> + +/* cl_khr_fp64 extension - no extension #define since it has no functions */ +/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */ + +#if CL_TARGET_OPENCL_VERSION <= 110 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#endif + +/* cl_khr_fp16 extension - no extension #define since it has no functions */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/* Memory object destruction + * + * Apple extension for managing externally allocated buffers used with cl_mem objects created with CL_MEM_USE_HOST_PTR + * + * Registers a user callback function that will be called when the memory object is deleted and its resources + * freed. Each call to clSetMemObjectDestructorAPPLE registers the specified user callback function on a callback + * stack associated with memobj. The registered user callback functions are called in the reverse order in + * which they were registered. The user callback functions are called and then the memory object is deleted + * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be + * notified when the memory referenced by host_ptr, specified when the memory object is created and used as + * the storage bits for the memory object, can be reused or freed. + * + * The application may not call CL APIs with the cl_mem object passed to the pfn_notify. + * + * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + */ +#define cl_APPLE_SetMemObjectDestructor 1 +cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem memobj, + void (* pfn_notify)(cl_mem memobj, void * user_data), + void * user_data) CL_EXT_SUFFIX__VERSION_1_0; + + +/* Context Logging Functions + * + * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
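 * A hedged one-line illustration (ctx, dev, and err are placeholder names, not
 * part of this header):
 *
 *     ctx = clCreateContext(NULL, 1, &dev, clLogMessagesToStderrAPPLE, NULL, &err);
 *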
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + * + * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger + */ +#define cl_APPLE_ContextLoggingFunctions 1 +extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * errstr, + const void * private_info, + size_t cb, + void * user_data) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ +extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * errstr, + const void * private_info, + size_t cb, + void * user_data) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ +extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * errstr, + const void * private_info, + size_t cb, + void * user_data) CL_EXT_SUFFIX__VERSION_1_0; + + +/************************ +* cl_khr_icd extension * +************************/ +#define cl_khr_icd 1 + +/* cl_platform_info */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 + +/* Additional Error Codes */ +#define CL_PLATFORM_NOT_FOUND_KHR -1001 + +extern CL_API_ENTRY cl_int CL_API_CALL +clIcdGetPlatformIDsKHR(cl_uint num_entries, + cl_platform_id * platforms, + cl_uint * num_platforms); + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries, + cl_platform_id * platforms, + cl_uint * num_platforms); + + +/******************************* + * cl_khr_il_program extension * + *******************************/ +#define cl_khr_il_program 1 + +/* New property to clGetDeviceInfo for retrieving supported intermediate + * languages + */ +#define CL_DEVICE_IL_VERSION_KHR 0x105B + +/* New property to clGetProgramInfo for retrieving the IL of a + * program + */ +#define CL_PROGRAM_IL_KHR 0x1169 + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithILKHR(cl_context context, + const void * il, + size_t length, + cl_int * errcode_ret); + +typedef CL_API_ENTRY cl_program +(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context, + const void * il, + size_t length, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; + +/* Extension: cl_khr_image2d_from_buffer + * + * This extension allows a 2D image to be created from a cl_mem buffer without + * a copy. The type associated with a 2D image created from a buffer in an + * OpenCL program is image2d_t. Both the sampler and sampler-less read_image + * built-in functions are supported for 2D images and 2D images created from + * a buffer. Similarly, the write_image built-ins are also supported for 2D + * images created from a buffer. + * + * When the 2D image from buffer is created, the client must specify the + * width, height, image format (i.e. channel order and channel data type) + * and optionally the row pitch. + * + * The pitch specified must be a multiple of + * CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels. + * The base address of the buffer must be aligned to + * CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels.
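 *
 * A sketch of the creation call under these rules (w, h, pitch, buf, ctx and
 * err are placeholder names, not part of the extension):
 *
 *     cl_image_format fmt  = { CL_RGBA, CL_UNORM_INT8 };
 *     cl_image_desc   desc = {0};
 *     desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
 *     desc.image_width     = w;
 *     desc.image_height    = h;
 *     desc.image_row_pitch = pitch;  // multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels
 *     desc.buffer          = buf;    // the existing, suitably aligned cl_mem buffer
 *     cl_mem img = clCreateImage(ctx, CL_MEM_READ_ONLY, &fmt, &desc, NULL, &err);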
+ */ + +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B + + +/************************************** + * cl_khr_initialize_memory extension * + **************************************/ + +#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 + + +/************************************** + * cl_khr_terminate_context extension * + **************************************/ + +#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 +#define CL_CONTEXT_TERMINATE_KHR 0x2032 + +#define cl_khr_terminate_context 1 +extern CL_API_ENTRY cl_int CL_API_CALL +clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2; + + +/* + * Extension: cl_khr_spir + * + * This extension adds support to create an OpenCL program object from a + * Standard Portable Intermediate Representation (SPIR) instance + */ + +#define CL_DEVICE_SPIR_VERSIONS 0x40E0 +#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 + + +/***************************************** + * cl_khr_create_command_queue extension * + *****************************************/ +#define cl_khr_create_command_queue 1 + +typedef cl_bitfield cl_queue_properties_khr; + +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithPropertiesKHR(cl_context context, + cl_device_id device, + const cl_queue_properties_khr* properties, + cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_command_queue +(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context, + cl_device_id device, + const cl_queue_properties_khr* properties, + cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; + + +/****************************************** +* cl_nv_device_attribute_query extension * +******************************************/ + +/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 + +/********************************* +* cl_amd_device_memory_flags * +*********************************/ +#define cl_amd_device_memory_flags 1 +#define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6) // Alloc from GPU's CPU visible heap + +/* cl_device_info */ +#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032 + +/********************************* +* cl_amd_device_attribute_query * +*********************************/ + +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 +#define CL_DEVICE_TOPOLOGY_AMD 0x4037 +#define CL_DEVICE_BOARD_NAME_AMD 0x4038 +#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039 +#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040 +#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041 +#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 +#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 +#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044 +#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045 +#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046 +#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047 +#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048 +#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049 +#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A +#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B +#define 
CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C +#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031 +#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033 +#define CL_DEVICE_PCIE_ID_AMD 0x4034 + +typedef union +{ + struct { cl_uint type; cl_uint data[5]; } raw; + struct { cl_uint type; cl_uchar unused[17]; cl_uchar bus; cl_uchar device; cl_uchar function; } pcie; +} cl_device_topology_amd; + +#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD 1 + +/************************** +* cl_amd_offline_devices * +**************************/ +#define CL_CONTEXT_OFFLINE_DEVICES_AMD 0x403F + +/******************************** +* cl_amd_bus_addressable_memory * +********************************/ + +/* cl_mem flag - bitfield */ +#define CL_MEM_BUS_ADDRESSABLE_AMD (1<<30) +#define CL_MEM_EXTERNAL_PHYSICAL_AMD (1<<31) + +#define CL_COMMAND_WAIT_SIGNAL_AMD 0x4080 +#define CL_COMMAND_WRITE_SIGNAL_AMD 0x4081 +#define CL_COMMAND_MAKE_BUFFERS_RESIDENT_AMD 0x4082 + +typedef struct _cl_bus_address_amd +{ + cl_ulong surface_bus_address; + cl_ulong marker_bus_address; +} cl_bus_address_amd; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clEnqueueWaitSignalAMD_fn)( cl_command_queue /*command_queue*/, + cl_mem /*mem_object*/, + cl_uint /*value*/, + cl_uint /*num_events*/, + const cl_event * /*event_wait_list*/, + cl_event * /*event*/) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clEnqueueWriteSignalAMD_fn)( cl_command_queue /*command_queue*/, + cl_mem /*mem_object*/, + cl_uint /*value*/, + cl_ulong /*offset*/, + cl_uint /*num_events*/, + const cl_event * /*event_list*/, + cl_event * /*event*/) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clEnqueueMakeBuffersResidentAMD_fn)( cl_command_queue /*command_queue*/, + cl_uint /*num_mem_objs*/, + cl_mem * /*mem_objects*/, + cl_bool /*blocking_make_resident*/, + cl_bus_address_amd * /*bus_addresses*/, + cl_uint /*num_events*/, + const cl_event * /*event_list*/, + cl_event * /*event*/) CL_EXT_SUFFIX__VERSION_1_2; + +/********************** +* cl_amd_liquid_flash * +***********************/ +#define cl_amd_liquid_flash 1 + +#define CL_COMMAND_READ_SSG_FILE_AMD 0x4083 +#define CL_COMMAND_WRITE_SSG_FILE_AMD 0x4087 + +#define CL_INVALID_FILE_OBJECT_AMD 0x4084 + +typedef struct _cl_file_amd * cl_file_amd; + +typedef cl_uint cl_file_flags_amd; +#define CL_FILE_READ_ONLY_AMD (1 << 0) +#define CL_FILE_WRITE_ONLY_AMD (1 << 1) +#define CL_FILE_READ_WRITE_AMD (1 << 2) + +typedef cl_uint cl_file_info_amd; +#define CL_FILE_BLOCK_SIZE_AMD 0x4085 +#define CL_FILE_SIZE_AMD 0x4086 + +typedef CL_API_ENTRY cl_file_amd +(CL_API_CALL * clCreateSsgFileObjectAMD_fn)(cl_context /*context*/, + cl_file_flags_amd /*flags*/, + const wchar_t * /*file_name*/, + cl_int * /*errcode_ret*/) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clGetSsgFileObjectInfoAMD_fn)(cl_file_amd /* file */, + cl_file_info_amd /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clRetainSsgFileObjectAMD_fn)( cl_file_amd /*file*/) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clReleaseSsgFileObjectAMD_fn)( cl_file_amd /*file*/) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clEnqueueReadSsgFileAMD_fn)(cl_command_queue /*command_queue*/, + cl_mem /*buffer*/, + cl_bool /*blocking_write*/, + size_t 
/*buffer_offset*/, + size_t /*cb*/, + cl_file_amd /*file*/, + size_t /*file_offset*/, + cl_uint /*num_events_in_wait_list*/, + const cl_event * /*event_wait_list*/, + cl_event * /*event*/) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clEnqueueWriteSsgFileAMD_fn)(cl_command_queue /*command_queue*/, + cl_mem /*buffer*/, + cl_bool /*blocking_read*/, + size_t /*buffer_offset*/, + size_t /*cb*/, + cl_file_amd /*file*/, + size_t /*file_offset*/, + cl_uint /*num_events_in_wait_list*/, + const cl_event * /*event_wait_list*/, + cl_event * /*event*/) CL_EXT_SUFFIX__VERSION_1_2; + +/************************* +* cl_amd_copy_buffer_p2p * +**************************/ +#define CL_DEVICE_NUM_P2P_DEVICES_AMD 0x4088 +#define CL_DEVICE_P2P_DEVICES_AMD 0x4089 + +#define cl_amd_copy_buffer_p2p 1 + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clEnqueueCopyBufferP2PAMD_fn)(cl_command_queue /*command_queue*/, + cl_mem /*src_buffer*/, + cl_mem /*dst_buffer*/, + size_t /*src_offset*/, + size_t /*dst_offset*/, + size_t /*cb*/, + cl_uint /*num_events_in_wait_list*/, + const cl_event* /*event_wait_list*/, + cl_event* /*event*/) CL_EXT_SUFFIX__VERSION_1_2; + +/*********************************** +* cl_amd_assembly_program extension * +***********************************/ +#define cl_amd_assembly_program 1 + +typedef CL_API_ENTRY cl_program (CL_API_CALL * clCreateProgramWithAssemblyAMD_fn) ( + cl_context /* context */, + cl_uint /* count */, + const char** /* strings */, + const size_t* /* lengths */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +#ifdef CL_VERSION_2_0 +/******************************** +* cl_amd_planar_yuv * +********************************/ + +/* cl_mem flag - bitfield */ +#define CL_YUV_IMAGE_Y_PLANE_AMD 0x0 +#define CL_YUV_IMAGE_UV_PLANE_AMD 0x1 + +typedef CL_API_ENTRY cl_mem +(CL_API_CALL * clGetPlaneFromImageAMD_fn)(cl_context /*context*/, + cl_mem /*mem*/, + cl_uint /*plane*/, + cl_int * /*errcode_ret*/) CL_EXT_SUFFIX__VERSION_2_0; +#endif + +// +/************************** +* cl_amd_command_queue_info * +**************************/ +#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E + +/* cl_kernel_exec_info for DVR DOPP texture support */ +#define CL_KERNEL_EXEC_INFO_NEW_VCOP_AMD 0x4120 +#define CL_KERNEL_EXEC_INFO_PFPA_VCOP_AMD 0x4121 + +/************************* +* cl_amd_object_metadata * +**************************/ +#define cl_amd_object_metadata 1 + +typedef size_t cl_key_amd; + +#define CL_INVALID_OBJECT_AMD 0x403A +#define CL_INVALID_KEY_AMD 0x403B +#define CL_PLATFORM_MAX_KEYS_AMD 0x403C + +typedef CL_API_ENTRY cl_key_amd (CL_API_CALL * clCreateKeyAMD_fn)( + cl_platform_id /* platform */, + void (CL_CALLBACK * /* destructor */)( void* /* old_value */), + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL * clObjectGetValueForKeyAMD_fn)( + void * /* object */, + cl_key_amd /* key */, + void ** /* ret_val */) CL_API_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL * clObjectSetValueForKeyAMD_fn)( + void * /* object */, + cl_key_amd /* key */, + void * /* value */) CL_API_SUFFIX__VERSION_1_1; +// + + +/********************************* +* cl_arm_printf extension +*********************************/ + +#define CL_PRINTF_CALLBACK_ARM 0x40B0 +#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 + + +/*********************************** +* cl_ext_device_fission extension +***********************************/ +#define cl_ext_device_fission 1 + +extern CL_API_ENTRY cl_int CL_API_CALL 
+clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; + +typedef cl_ulong cl_device_partition_property_ext; +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevicesEXT(cl_device_id in_device, + const cl_device_partition_property_ext * properties, + cl_uint num_entries, + cl_device_id * out_devices, + cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device, + const cl_device_partition_property_ext * properties, + cl_uint num_entries, + cl_device_id * out_devices, + cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1; + +/* cl_device_partition_property_ext */ +#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 +#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 +#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + +/* clDeviceGetInfo selectors */ +#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 +#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 +#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 +#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 +#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 + +/* clGetImageInfo enum */ +#define CL_IMAGE_BYTE_PITCH_AMD 0x4059 + +/* error codes */ +#define CL_DEVICE_PARTITION_FAILED_EXT -1057 +#define CL_INVALID_PARTITION_COUNT_EXT -1058 +#define CL_INVALID_PARTITION_NAME_EXT -1059 + +/* CL_AFFINITY_DOMAINs */ +#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 +#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 +#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 +#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 +#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 +#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + +/* cl_device_partition_property_ext list terminators */ +#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) +#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) +#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) + + +/*********************************** + * cl_ext_migrate_memobject extension definitions + ***********************************/ +#define cl_ext_migrate_memobject 1 + +typedef cl_bitfield cl_mem_migration_flags_ext; + +#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1 + +#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem * mem_objects, + cl_mem_migration_flags_ext flags, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem * mem_objects, + cl_mem_migration_flags_ext flags, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +/********************************* +* cl_qcom_ext_host_ptr extension +*********************************/ +#define cl_qcom_ext_host_ptr 1 + +#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) + +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 +#define CL_IMAGE_ROW_ALIGNMENT_QCOM 
0x40A2 +#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 +#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 +#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 +#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 +#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 + +typedef cl_uint cl_image_pitch_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceImageInfoQCOM(cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format *image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +typedef struct _cl_mem_ext_host_ptr +{ + /* Type of external memory allocation. */ + /* Legal values will be defined in layered extensions. */ + cl_uint allocation_type; + + /* Host cache policy for this external memory allocation. */ + cl_uint host_cache_policy; + +} cl_mem_ext_host_ptr; + + +/******************************************* +* cl_qcom_ext_host_ptr_iocoherent extension +********************************************/ + +/* Cache policy specifying io-coherence */ +#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9 + + +/********************************* +* cl_qcom_ion_host_ptr extension +*********************************/ + +#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 + +typedef struct _cl_mem_ion_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* ION file descriptor */ + int ion_filedesc; + + /* Host pointer to the ION allocated memory */ + void* ion_hostptr; + +} cl_mem_ion_host_ptr; + + +/********************************* +* cl_qcom_android_native_buffer_host_ptr extension +*********************************/ + +#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 + +typedef struct _cl_mem_android_native_buffer_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. 
*/ + cl_mem_ext_host_ptr ext_host_ptr; + + /* Virtual pointer to the android native buffer */ + void* anb_ptr; + +} cl_mem_android_native_buffer_host_ptr; + + +/****************************************** + * cl_img_yuv_image extension * + ******************************************/ + +/* Image formats used in clCreateImage */ +#define CL_NV21_IMG 0x40D0 +#define CL_YV12_IMG 0x40D1 + + +/****************************************** + * cl_img_cached_allocations extension * + ******************************************/ + +/* Flag values used by clCreateBuffer */ +#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26) +#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27) + + +/****************************************** + * cl_img_use_gralloc_ptr extension * + ******************************************/ +#define cl_img_use_gralloc_ptr 1 + +/* Flag values used by clCreateBuffer */ +#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28) + +/* To be used by clGetEventInfo: */ +#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2 +#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3 + +/* Error code from clEnqueueReleaseGrallocObjectsIMG */ +#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + + +/********************************* +* cl_khr_subgroups extension +*********************************/ +#define cl_khr_subgroups 1 + +#if !defined(CL_VERSION_2_1) +/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h. + In hindsight, there should have been a khr suffix on this type for + the extension, but keeping it un-suffixed to maintain backwards + compatibility. */ +typedef cl_uint cl_kernel_sub_group_info; +#endif + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfoKHR(cl_kernel in_kernel, + cl_device_id in_device, + cl_kernel_sub_group_info param_name, + size_t input_value_size, + const void * input_value, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel, + cl_device_id in_device, + cl_kernel_sub_group_info param_name, + size_t input_value_size, + const void * input_value, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + + +/********************************* +* cl_khr_mipmap_image extension +*********************************/ + +/* cl_sampler_properties */ +#define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155 +#define CL_SAMPLER_LOD_MIN_KHR 0x1156 +#define CL_SAMPLER_LOD_MAX_KHR 0x1157 + + +/********************************* +* cl_khr_priority_hints extension +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. 
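
   A sketch of how the hint is passed (ctx, dev, q and err are placeholder
   names, not part of the extension):

       cl_queue_properties props[] = { CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_LOW_KHR, 0 };
       q = clCreateCommandQueueWithProperties(ctx, dev, props, &err);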
*/ +#define cl_khr_priority_hints 1 + +typedef cl_uint cl_queue_priority_khr; + +/* cl_command_queue_properties */ +#define CL_QUEUE_PRIORITY_KHR 0x1096 + +/* cl_queue_priority_khr */ +#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0) +#define CL_QUEUE_PRIORITY_MED_KHR (1<<1) +#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2) + + +/********************************* +* cl_khr_throttle_hints extension +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. */ +#define cl_khr_throttle_hints 1 + +typedef cl_uint cl_queue_throttle_khr; + +/* cl_command_queue_properties */ +#define CL_QUEUE_THROTTLE_KHR 0x1097 + +/* cl_queue_throttle_khr */ +#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0) +#define CL_QUEUE_THROTTLE_MED_KHR (1<<1) +#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2) + + +/********************************* +* cl_khr_subgroup_named_barrier +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. */ +#define cl_khr_subgroup_named_barrier 1 + +/* cl_device_info */ +#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 + + +/********************************* +* cl_khr_extended_versioning +*********************************/ + +#define CL_VERSION_MAJOR_BITS_KHR (10) +#define CL_VERSION_MINOR_BITS_KHR (10) +#define CL_VERSION_PATCH_BITS_KHR (12) + +#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1) +#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1) +#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1) + +#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) +#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR) +#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR) + +#define CL_MAKE_VERSION_KHR(major, minor, patch) \ + ((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \ + (((minor) & CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \ + ((patch) & CL_VERSION_PATCH_MASK_KHR)) + +typedef cl_uint cl_version_khr; + +#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64 + +typedef struct _cl_name_version_khr +{ + cl_version_khr version; + char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR]; +} cl_name_version_khr; + +/* cl_platform_info */ +#define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906 +#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907 + +/* cl_device_info */ +#define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E +#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F +#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060 +#define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061 +#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062 + + +/********************************** + * cl_arm_import_memory extension * + **********************************/ +#define cl_arm_import_memory 1 + +typedef intptr_t cl_import_properties_arm; + +/* Default and valid property name for cl_arm_import_memory */ +#define CL_IMPORT_TYPE_ARM 0x40B2 + +/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_HOST_ARM 0x40B3 + +/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4 + +/* Protected memory property */ +#define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5 + +/* Android hardware buffer type value for
CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2 + +/* Data consistency with host property */ +#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3 + +/* Import memory size value to indicate a size for the whole buffer */ +#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX + +/* This extension adds a new function that allows for direct memory import into + * OpenCL via the clImportMemoryARM function. + * + * Memory imported through this interface will be mapped into the device's page + * tables directly, providing zero copy access. It will never fall back to copy + * operations and aliased buffers. + * + * Types of memory supported for import are specified as additional extension + * strings. + * + * This extension produces cl_mem allocations which are compatible with all other + * users of cl_mem in the standard API. + * + * This extension maps pages with the same properties as the normal buffer creation + * function clCreateBuffer. + */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clImportMemoryARM( cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm *properties, + void *memory, + size_t size, + cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0; + + +/****************************************** + * cl_arm_shared_virtual_memory extension * + ******************************************/ +#define cl_arm_shared_virtual_memory 1 + +/* Used by clGetDeviceInfo */ +#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6 + +/* Used by clGetMemObjectInfo */ +#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7 + +/* Used by clSetKernelExecInfoARM: */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9 + +/* To be used by clGetEventInfo: */ +#define CL_COMMAND_SVM_FREE_ARM 0x40BA +#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB +#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC +#define CL_COMMAND_SVM_MAP_ARM 0x40BD +#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE + +/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. 
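
   A sketch of the query (dev is a placeholder cl_device_id):

       cl_device_svm_capabilities_arm caps = 0;
       clGetDeviceInfo(dev, CL_DEVICE_SVM_CAPABILITIES_ARM, sizeof(caps), &caps, NULL);
       if (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM) { ... }  // fine-grain SVM buffers usable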
*/ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3) + +/* Flag values used by clSVMAllocARM: */ +#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10) +#define CL_MEM_SVM_ATOMICS_ARM (1 << 11) + +typedef cl_bitfield cl_svm_mem_flags_arm; +typedef cl_uint cl_kernel_exec_info_arm; +typedef cl_bitfield cl_device_svm_capabilities_arm; + +extern CL_API_ENTRY void * CL_API_CALL +clSVMAllocARM(cl_context context, + cl_svm_mem_flags_arm flags, + size_t size, + cl_uint alignment) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFreeARM(cl_context context, + void * svm_pointer) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFreeARM(cl_command_queue command_queue, + cl_uint num_svm_pointers, + void * svm_pointers[], + void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, + cl_uint num_svm_pointers, + void * svm_pointers[], + void * user_data), + void * user_data, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpyARM(cl_command_queue command_queue, + cl_bool blocking_copy, + void * dst_ptr, + const void * src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFillARM(cl_command_queue command_queue, + void * svm_ptr, + const void * pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMapARM(cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void * svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmapARM(cl_command_queue command_queue, + void * svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointerARM(cl_kernel kernel, + cl_uint arg_index, + const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfoARM(cl_kernel kernel, + cl_kernel_exec_info_arm param_name, + size_t param_value_size, + const void * param_value) CL_EXT_SUFFIX__VERSION_1_2; + +/******************************** + * cl_arm_get_core_id extension * + ********************************/ + +#ifdef CL_VERSION_1_2 + +#define cl_arm_get_core_id 1 + +/* Device info property for bitfield of cores present */ +#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF + +#endif /* CL_VERSION_1_2 */ + +/********************************* +* cl_arm_job_slot_selection +*********************************/ + +#define cl_arm_job_slot_selection 1 + +/* cl_device_info */ +#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0 + +/* cl_command_queue_properties */ +#define CL_QUEUE_JOB_SLOT_ARM 0x41E1 + +#ifdef __cplusplus +} +#endif + + +#endif /* __CL_EXT_H */ diff --git a/amdocl/CL/cl_gl.h b/amdocl/CL/cl_gl.h new file mode 100644 index 0000000000..fbdaf62977 --- /dev/null +++ b/amdocl/CL/cl_gl.h @@ -0,0 +1,171 @@ 
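To make the cl_khr_extended_versioning scheme in cl_ext.h above concrete: a packed cl_version_khr keeps the major version in the top 10 bits, the minor in the next 10, and the patch in the low 12, so version values order correctly when compared as plain integers. A minimal sketch using only the macros defined above (demo_version_macros is an illustrative name, not part of any header):

    #include <CL/cl_ext.h>

    static void demo_version_macros(void) {
        /* pack 3.0.5: (3 << 22) | (0 << 12) | 5 == 0x00C00005 */
        cl_version_khr v = CL_MAKE_VERSION_KHR(3, 0, 5);
        /* unpack: CL_VERSION_MAJOR_KHR(v) == 3,
           CL_VERSION_MINOR_KHR(v) == 0, CL_VERSION_PATCH_KHR(v) == 5 */
        (void)v;
    }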
+/********************************************************************************** + * Copyright (c) 2008-2019 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +#ifndef __OPENCL_CL_GL_H +#define __OPENCL_CL_GL_H + +#include <CL/cl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; +typedef struct __GLsync *cl_GLsync; + +/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 +#ifdef CL_VERSION_1_2 +#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E +#define CL_GL_OBJECT_TEXTURE1D 0x200F +#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 +#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 +#endif + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 +#ifdef CL_VERSION_1_2 +#define CL_GL_NUM_SAMPLES 0x2012 +#endif + + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLBuffer(cl_context context, + cl_mem_flags flags, + cl_GLuint bufobj, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture(cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLRenderbuffer(cl_context context, + cl_mem_flags flags, + cl_GLuint renderbuffer, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLObjectInfo(cl_mem memobj, + cl_gl_object_type * gl_object_type, + cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLTextureInfo(cl_mem memobj, + cl_gl_texture_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue command_queue, 
+ cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLContextInfoKHR(const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ diff --git a/amdocl/CL/cl_gl_ext.h b/amdocl/CL/cl_gl_ext.h new file mode 100644 index 0000000000..c26d31abed --- /dev/null +++ b/amdocl/CL/cl_gl_ext.h @@ -0,0 +1,52 @@ +/********************************************************************************** + * Copyright (c) 2008-2019 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <CL/cl_gl.h> + +/* + * cl_khr_gl_event extension + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context context, + cl_GLsync cl_GLsync, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/amdocl/CL/cl_icd.h b/amdocl/CL/cl_icd.h new file mode 100644 index 0000000000..2be64719b6 --- /dev/null +++ b/amdocl/CL/cl_icd.h @@ -0,0 +1,1269 @@ +/******************************************************************************* + * Copyright (c) 2019 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef OPENCL_CL_ICD_H +#define OPENCL_CL_ICD_H + +#include <CL/cl.h> +#include <CL/cl_egl.h> +#include <CL/cl_ext.h> +#include <CL/cl_gl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This file contains pointer type definitions for each of the CL API calls as + * well as a type definition for the dispatch table used by the Khronos ICD + * loader (see cl_khr_icd extension specification for background). 
+ */ + +/* API function pointer definitions */ + +// Platform APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformIDs)( + cl_uint num_entries, cl_platform_id *platforms, + cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformInfo)( + cl_platform_id platform, cl_platform_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +// Device APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDs)( + cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, + cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceInfo)( + cl_device_id device, cl_device_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevices)( + cl_device_id in_device, + const cl_device_partition_property *partition_properties, + cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices); + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDevice)( + cl_device_id device) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDevice)( + cl_device_id device) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clCreateSubDevices; +typedef void *cl_api_clRetainDevice; +typedef void *cl_api_clReleaseDevice; + +#endif + +// Context APIs +typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContext)( + const cl_context_properties *properties, cl_uint num_devices, + const cl_device_id *devices, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContextFromType)( + const cl_context_properties *properties, cl_device_type device_type, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainContext)( + cl_context context) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseContext)( + cl_context context) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetContextInfo)( + cl_context context, cl_context_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +// Command Queue APIs +typedef CL_API_ENTRY cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueue)( + cl_context context, cl_device_id device, + cl_command_queue_properties properties, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +typedef CL_API_ENTRY +cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueueWithProperties)( + cl_context /* context */, cl_device_id /* device */, + const cl_queue_properties * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#else + +typedef void *cl_api_clCreateCommandQueueWithProperties; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainCommandQueue)( + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseCommandQueue)( + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +typedef 
CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetCommandQueueInfo)( + cl_command_queue command_queue, cl_command_queue_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +// Memory Object APIs +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBuffer)( + cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage)( + cl_context context, cl_mem_flags flags, const cl_image_format *image_format, + const cl_image_desc *image_desc, void *host_ptr, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clCreateImage; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainMemObject)( + cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseMemObject)( + cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSupportedImageFormats)( + cl_context context, cl_mem_flags flags, cl_mem_object_type image_type, + cl_uint num_entries, cl_image_format *image_formats, + cl_uint *num_image_formats) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetMemObjectInfo)( + cl_mem memobj, cl_mem_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetImageInfo)( + cl_mem image, cl_image_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreatePipe)( + cl_context /* context */, cl_mem_flags /* flags */, + cl_uint /* pipe_packet_size */, cl_uint /* pipe_max_packets */, + const cl_pipe_properties * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPipeInfo)( + cl_mem /* pipe */, cl_pipe_info /* param_name */, + size_t /* param_value_size */, void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clSVMAlloc)( + cl_context /* context */, cl_svm_mem_flags /* flags */, size_t /* size */, + unsigned int /* alignment */)CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY void(CL_API_CALL *cl_api_clSVMFree)( + cl_context /* context */, + void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; + +#else + +typedef void *cl_api_clCreatePipe; +typedef void *cl_api_clGetPipeInfo; +typedef void *cl_api_clSVMAlloc; +typedef void *cl_api_clSVMFree; + +#endif + +// Sampler APIs +typedef CL_API_ENTRY cl_sampler(CL_API_CALL *cl_api_clCreateSampler)( + cl_context context, cl_bool normalized_coords, + cl_addressing_mode addressing_mode, cl_filter_mode filter_mode, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainSampler)( + cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseSampler)( + cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSamplerInfo)( + cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +typedef CL_API_ENTRY 
+cl_sampler(CL_API_CALL *cl_api_clCreateSamplerWithProperties)( + cl_context /* context */, + const cl_sampler_properties * /* sampler_properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#else + +typedef void *cl_api_clCreateSamplerWithProperties; + +#endif + +// Program Object APIs +typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithSource)( + cl_context context, cl_uint count, const char **strings, + const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithBinary)( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const size_t *lengths, const unsigned char **binaries, + cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY +cl_program(CL_API_CALL *cl_api_clCreateProgramWithBuiltInKernels)( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const char *kernel_names, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clCreateProgramWithBuiltInKernels; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainProgram)( + cl_program program) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseProgram)( + cl_program program) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clBuildProgram)( + cl_program program, cl_uint num_devices, const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCompileProgram)( + cl_program program, cl_uint num_devices, const cl_device_id *device_list, + const char *options, cl_uint num_input_headers, + const cl_program *input_headers, const char **header_include_names, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clLinkProgram)( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const char *options, cl_uint num_input_programs, + const cl_program *input_programs, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clCompileProgram; +typedef void *cl_api_clLinkProgram; + +#endif + +#ifdef CL_VERSION_2_2 + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clSetProgramSpecializationConstant)( + cl_program program, cl_uint spec_id, size_t spec_size, + const void *spec_value) CL_API_SUFFIX__VERSION_2_2; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetProgramReleaseCallback)( + cl_program program, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) CL_API_SUFFIX__VERSION_2_2; + +#else + +typedef void *cl_api_clSetProgramSpecializationConstant; +typedef void *cl_api_clSetProgramReleaseCallback; + +#endif + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadPlatformCompiler)( + cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clUnloadPlatformCompiler; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramInfo)( + cl_program program, cl_program_info param_name, size_t param_value_size, + void *param_value, size_t 
*param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramBuildInfo)( + cl_program program, cl_device_id device, cl_program_build_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +// Kernel Object APIs +typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCreateKernel)( + cl_program program, const char *kernel_name, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateKernelsInProgram)( + cl_program program, cl_uint num_kernels, cl_kernel *kernels, + cl_uint *num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainKernel)( + cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseKernel)( + cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArg)( + cl_kernel kernel, cl_uint arg_index, size_t arg_size, + const void *arg_value) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelInfo)( + cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelArgInfo)( + cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clGetKernelArgInfo; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelWorkGroupInfo)( + cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArgSVMPointer)( + cl_kernel /* kernel */, cl_uint /* arg_index */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelExecInfo)( + cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */, + size_t /* param_value_size */, + const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfoKHR)( + cl_kernel /* in_kernel */, cl_device_id /*in_device*/, + cl_kernel_sub_group_info /* param_name */, size_t /*input_value_size*/, + const void * /*input_value*/, size_t /*param_value_size*/, + void * /*param_value*/, + size_t * /*param_value_size_ret*/) CL_EXT_SUFFIX__VERSION_2_0; + +#else + +typedef void *cl_api_clSetKernelArgSVMPointer; +typedef void *cl_api_clSetKernelExecInfo; +typedef void *cl_api_clGetKernelSubGroupInfoKHR; + +#endif + +// Event Object APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clWaitForEvents)( + cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventInfo)( + cl_event event, cl_event_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainEvent)(cl_event event) + CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseEvent)(cl_event event) + CL_API_SUFFIX__VERSION_1_0; + +// Profiling APIs +typedef CL_API_ENTRY 
cl_int(CL_API_CALL *cl_api_clGetEventProfilingInfo)( + cl_event event, cl_profiling_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +// Flush and Finish APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFlush)( + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFinish)( + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +// Enqueued Commands APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, + size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBufferRect)( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, + const size_t *buffer_origin, const size_t *host_origin, + const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, + size_t host_row_pitch, size_t host_slice_pitch, void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_1; + +#else + +typedef void *cl_api_clEnqueueReadBufferRect; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, + size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBufferRect)( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, + const size_t *buffer_origin, const size_t *host_origin, + const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, + size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_1; + +#else + +typedef void *cl_api_clEnqueueWriteBufferRect; + +#endif + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillBuffer)( + cl_command_queue command_queue, cl_mem buffer, const void *pattern, + size_t pattern_size, size_t offset, size_t cb, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clEnqueueFillBuffer; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)( + cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t cb, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferRect)( + cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, + size_t dst_slice_pitch, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_1; + +#else + +typedef void *cl_api_clEnqueueCopyBufferRect; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadImage)( + cl_command_queue 
command_queue, cl_mem image, cl_bool blocking_read, + const size_t *origin, const size_t *region, size_t row_pitch, + size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteImage)( + cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, + const size_t *origin, const size_t *region, size_t input_row_pitch, + size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillImage)( + cl_command_queue command_queue, cl_mem image, const void *fill_color, + const size_t origin[3], const size_t region[3], + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clEnqueueFillImage; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImage)( + cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImageToBuffer)( + cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer, + const size_t *src_origin, const size_t *region, size_t dst_offset, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferToImage)( + cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, + size_t src_offset, const size_t *dst_origin, const size_t *region, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapBuffer)( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, + cl_map_flags map_flags, size_t offset, size_t cb, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapImage)( + cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, + cl_map_flags map_flags, const size_t *origin, const size_t *region, + size_t *image_row_pitch, size_t *image_slice_pitch, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueUnmapMemObject)( + cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMigrateMemObjects)( + cl_command_queue command_queue, cl_uint num_mem_objects, + const cl_mem *mem_objects, cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clEnqueueMigrateMemObjects; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNDRangeKernel)( + cl_command_queue command_queue, 
cl_kernel kernel, cl_uint work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueTask)( + cl_command_queue command_queue, cl_kernel kernel, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)( + cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *), + void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list, + const void **args_mem_loc, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarkerWithWaitList)( + cl_command_queue command_queue, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrierWithWaitList)( + cl_command_queue command_queue, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY void *( + CL_API_CALL *cl_api_clGetExtensionFunctionAddressForPlatform)( + cl_platform_id platform, + const char *function_name)CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clEnqueueMarkerWithWaitList; +typedef void *cl_api_clEnqueueBarrierWithWaitList; +typedef void *cl_api_clGetExtensionFunctionAddressForPlatform; + +#endif + +// Shared Virtual Memory APIs + +#ifdef CL_VERSION_2_0 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)( + cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */, + void ** /* svm_pointers */, + void(CL_CALLBACK *pfn_free_func)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void ** /* svm_pointers[] */, + void * /* user_data */), + void * /* user_data */, cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemcpy)( + cl_command_queue /* command_queue */, cl_bool /* blocking_copy */, + void * /* dst_ptr */, const void * /* src_ptr */, size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemFill)( + cl_command_queue /* command_queue */, void * /* svm_ptr */, + const void * /* pattern */, size_t /* pattern_size */, size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMap)( + cl_command_queue /* command_queue */, cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, void * /* svm_ptr */, size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMUnmap)( + cl_command_queue /* command_queue */, void * /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) 
CL_API_SUFFIX__VERSION_2_0; + +#else + +typedef void *cl_api_clEnqueueSVMFree; +typedef void *cl_api_clEnqueueSVMMemcpy; +typedef void *cl_api_clEnqueueSVMMemFill; +typedef void *cl_api_clEnqueueSVMMap; +typedef void *cl_api_clEnqueueSVMUnmap; + +#endif + +// Deprecated APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetCommandQueueProperty)( + cl_command_queue command_queue, cl_command_queue_properties properties, + cl_bool enable, cl_command_queue_properties *old_properties) + CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage2D)( + cl_context context, cl_mem_flags flags, const cl_image_format *image_format, + size_t image_width, size_t image_height, size_t image_row_pitch, + void *host_ptr, cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage3D)( + cl_context context, cl_mem_flags flags, const cl_image_format *image_format, + size_t image_width, size_t image_height, size_t image_depth, + size_t image_row_pitch, size_t image_slice_pitch, void *host_ptr, + cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadCompiler)(void) + CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarker)( + cl_command_queue command_queue, + cl_event *event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWaitForEvents)( + cl_command_queue command_queue, cl_uint num_events, + const cl_event *event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrier)( + cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clGetExtensionFunctionAddress)( + const char *function_name)CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +// GL and other APIs +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLBuffer)( + cl_context context, cl_mem_flags flags, cl_GLuint bufobj, + int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture)( + cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, + cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture2D)( + cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, + cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture3D)( + cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, + cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLRenderbuffer)( + cl_context context, cl_mem_flags flags, cl_GLuint renderbuffer, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLObjectInfo)( + cl_mem memobj, cl_gl_object_type *gl_object_type, + cl_GLuint *gl_object_name) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLTextureInfo)( + cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireGLObjects)( + cl_command_queue command_queue, cl_uint num_objects, + 
const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseGLObjects)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +/* cl_khr_gl_sharing */ +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLContextInfoKHR)( + const cl_context_properties *properties, cl_gl_context_info param_name, + size_t param_value_size, void *param_value, size_t *param_value_size_ret); + +/* cl_khr_gl_event */ +typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromGLsyncKHR)( + cl_context context, cl_GLsync sync, cl_int *errcode_ret); + +#if defined(_WIN32) + +/* cl_khr_d3d10_sharing */ + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D10KHR)( + cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, + void *d3d_object, cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, cl_device_id *devices, + cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D10Buffer *resource, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture2DKHR)( + cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource, + UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture3DKHR)( + cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource, + UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D10KHR( + cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, + void *d3d_object, cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromD3D10BufferKHR(cl_context context, cl_mem_flags flags, + ID3D10Buffer *resource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture2DKHR( + cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource, + UINT subresource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture3DKHR( + cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource, + UINT subresource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D10ObjectsKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR( + cl_command_queue 
command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +/* cl_khr_d3d11_sharing */ +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D11KHR)( + cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, + void *d3d_object, cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, cl_device_id *devices, + cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D11Buffer *resource, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture2DKHR)( + cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource, + UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture3DKHR)( + cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource, + UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D11ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D11ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +/* cl_khr_dx9_media_sharing */ +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR)( + cl_platform_id platform, cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr *media_adapters_type, void *media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries, + cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromDX9MediaSurfaceKHR)( + cl_context context, cl_mem_flags flags, + cl_dx9_media_adapter_type_khr adapter_type, void *surface_info, + cl_uint plane, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +/* cl_khr_d3d11_sharing */ +extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D11KHR( + cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, + void *d3d_object, cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromD3D11BufferKHR(cl_context context, cl_mem_flags flags, + ID3D11Buffer *resource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture2DKHR( + cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource, + UINT subresource, 
cl_int *errcode_ret); + +extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture3DKHR( + cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource, + UINT subresource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D11ObjectsKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D11ObjectsKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +/* cl_khr_dx9_media_sharing */ +extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromDX9MediaAdapterKHR( + cl_platform_id platform, cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr *media_adapter_type, void *media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries, + cl_device_id *devices, cl_uint *num_devices); + +extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromDX9MediaSurfaceKHR( + cl_context context, cl_mem_flags flags, + cl_dx9_media_adapter_type_khr adapter_type, void *surface_info, + cl_uint plane, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireDX9MediaSurfacesKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9MediaSurfacesKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +#else + +/* cl_khr_d3d10_sharing */ +typedef void *cl_api_clGetDeviceIDsFromD3D10KHR; +typedef void *cl_api_clCreateFromD3D10BufferKHR; +typedef void *cl_api_clCreateFromD3D10Texture2DKHR; +typedef void *cl_api_clCreateFromD3D10Texture3DKHR; +typedef void *cl_api_clEnqueueAcquireD3D10ObjectsKHR; +typedef void *cl_api_clEnqueueReleaseD3D10ObjectsKHR; + +/* cl_khr_d3d11_sharing */ +typedef void *cl_api_clGetDeviceIDsFromD3D11KHR; +typedef void *cl_api_clCreateFromD3D11BufferKHR; +typedef void *cl_api_clCreateFromD3D11Texture2DKHR; +typedef void *cl_api_clCreateFromD3D11Texture3DKHR; +typedef void *cl_api_clEnqueueAcquireD3D11ObjectsKHR; +typedef void *cl_api_clEnqueueReleaseD3D11ObjectsKHR; + +/* cl_khr_dx9_media_sharing */ +typedef void *cl_api_clCreateFromDX9MediaSurfaceKHR; +typedef void *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR; +typedef void *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR; +typedef void *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR; + +#endif + +/* OpenCL 1.1 */ + +#ifdef CL_VERSION_1_1 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetEventCallback)( + cl_event /* event */, cl_int /* command_exec_callback_type */, + void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateSubBuffer)( + cl_mem /* buffer */, cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clSetMemObjectDestructorCallback)( + cl_mem /* memobj */, + void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */, + void * /*user_data*/), + void * /*user_data */) 
CL_API_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateUserEvent)( + cl_context /* context */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetUserEventStatus)( + cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +#else + +typedef void *cl_api_clSetEventCallback; +typedef void *cl_api_clCreateSubBuffer; +typedef void *cl_api_clSetMemObjectDestructorCallback; +typedef void *cl_api_clCreateUserEvent; +typedef void *cl_api_clSetUserEventStatus; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevicesEXT)( + cl_device_id in_device, + const cl_device_partition_property_ext *partition_properties, + cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices); + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDeviceEXT)( + cl_device_id device) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDeviceEXT)( + cl_device_id device) CL_API_SUFFIX__VERSION_1_0; + +/* cl_khr_egl_image */ +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromEGLImageKHR)( + cl_context context, CLeglDisplayKHR display, CLeglImageKHR image, + cl_mem_flags flags, const cl_egl_image_properties_khr *properties, + cl_int *errcode_ret); + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireEGLObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseEGLObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +/* cl_khr_egl_event */ +typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromEGLSyncKHR)( + cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display, + cl_int *errcode_ret); + +#ifdef CL_VERSION_2_1 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetDefaultDeviceCommandQueue)( + cl_context context, cl_device_id device, + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithIL)( + cl_context context, const void *il, size_t length, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfo)( + cl_kernel kernel, cl_device_id device, cl_kernel_sub_group_info param_name, + size_t input_value_size, const void *input_value, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCloneKernel)( + cl_kernel source_kernel, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMigrateMem)( + cl_command_queue command_queue, cl_uint num_svm_pointers, + const void **svm_pointers, const size_t *sizes, + cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceAndHostTimer)( + cl_device_id device, cl_ulong *device_timestamp, + cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetHostTimer)( + cl_device_id device, cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1; 
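These typedefs exist so that the Khronos ICD loader can route every entry point through a per-vendor dispatch table (the struct defined just below). A minimal sketch of the loader side, assuming the usual ICD convention that every API handle begins with a pointer to its vendor's dispatch table; the _cl_device_id layout here is a hypothetical stand-in for the vendor's real definition, and production loaders add handle validation:

    /* Hypothetical loader-side forwarding; not part of this header. */
    struct _cl_device_id { struct _cl_icd_dispatch *dispatch; };

    cl_int clGetHostTimer(cl_device_id device, cl_ulong *host_timestamp) {
        /* recover the vendor implementation from the handle's leading
           dispatch pointer and forward the call unchanged */
        return device->dispatch->clGetHostTimer(device, host_timestamp);
    }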
+ +#else + +typedef void *cl_api_clSetDefaultDeviceCommandQueue; +typedef void *cl_api_clCreateProgramWithIL; +typedef void *cl_api_clGetKernelSubGroupInfo; +typedef void *cl_api_clCloneKernel; +typedef void *cl_api_clEnqueueSVMMigrateMem; +typedef void *cl_api_clGetDeviceAndHostTimer; +typedef void *cl_api_clGetHostTimer; + +#endif + +/* Vendor dispatch table structure */ + +typedef struct _cl_icd_dispatch { + /* OpenCL 1.0 */ + cl_api_clGetPlatformIDs clGetPlatformIDs; + cl_api_clGetPlatformInfo clGetPlatformInfo; + cl_api_clGetDeviceIDs clGetDeviceIDs; + cl_api_clGetDeviceInfo clGetDeviceInfo; + cl_api_clCreateContext clCreateContext; + cl_api_clCreateContextFromType clCreateContextFromType; + cl_api_clRetainContext clRetainContext; + cl_api_clReleaseContext clReleaseContext; + cl_api_clGetContextInfo clGetContextInfo; + cl_api_clCreateCommandQueue clCreateCommandQueue; + cl_api_clRetainCommandQueue clRetainCommandQueue; + cl_api_clReleaseCommandQueue clReleaseCommandQueue; + cl_api_clGetCommandQueueInfo clGetCommandQueueInfo; + cl_api_clSetCommandQueueProperty clSetCommandQueueProperty; + cl_api_clCreateBuffer clCreateBuffer; + cl_api_clCreateImage2D clCreateImage2D; + cl_api_clCreateImage3D clCreateImage3D; + cl_api_clRetainMemObject clRetainMemObject; + cl_api_clReleaseMemObject clReleaseMemObject; + cl_api_clGetSupportedImageFormats clGetSupportedImageFormats; + cl_api_clGetMemObjectInfo clGetMemObjectInfo; + cl_api_clGetImageInfo clGetImageInfo; + cl_api_clCreateSampler clCreateSampler; + cl_api_clRetainSampler clRetainSampler; + cl_api_clReleaseSampler clReleaseSampler; + cl_api_clGetSamplerInfo clGetSamplerInfo; + cl_api_clCreateProgramWithSource clCreateProgramWithSource; + cl_api_clCreateProgramWithBinary clCreateProgramWithBinary; + cl_api_clRetainProgram clRetainProgram; + cl_api_clReleaseProgram clReleaseProgram; + cl_api_clBuildProgram clBuildProgram; + cl_api_clUnloadCompiler clUnloadCompiler; + cl_api_clGetProgramInfo clGetProgramInfo; + cl_api_clGetProgramBuildInfo clGetProgramBuildInfo; + cl_api_clCreateKernel clCreateKernel; + cl_api_clCreateKernelsInProgram clCreateKernelsInProgram; + cl_api_clRetainKernel clRetainKernel; + cl_api_clReleaseKernel clReleaseKernel; + cl_api_clSetKernelArg clSetKernelArg; + cl_api_clGetKernelInfo clGetKernelInfo; + cl_api_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo; + cl_api_clWaitForEvents clWaitForEvents; + cl_api_clGetEventInfo clGetEventInfo; + cl_api_clRetainEvent clRetainEvent; + cl_api_clReleaseEvent clReleaseEvent; + cl_api_clGetEventProfilingInfo clGetEventProfilingInfo; + cl_api_clFlush clFlush; + cl_api_clFinish clFinish; + cl_api_clEnqueueReadBuffer clEnqueueReadBuffer; + cl_api_clEnqueueWriteBuffer clEnqueueWriteBuffer; + cl_api_clEnqueueCopyBuffer clEnqueueCopyBuffer; + cl_api_clEnqueueReadImage clEnqueueReadImage; + cl_api_clEnqueueWriteImage clEnqueueWriteImage; + cl_api_clEnqueueCopyImage clEnqueueCopyImage; + cl_api_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer; + cl_api_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage; + cl_api_clEnqueueMapBuffer clEnqueueMapBuffer; + cl_api_clEnqueueMapImage clEnqueueMapImage; + cl_api_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; + cl_api_clEnqueueNDRangeKernel clEnqueueNDRangeKernel; + cl_api_clEnqueueTask clEnqueueTask; + cl_api_clEnqueueNativeKernel clEnqueueNativeKernel; + cl_api_clEnqueueMarker clEnqueueMarker; + cl_api_clEnqueueWaitForEvents clEnqueueWaitForEvents; + cl_api_clEnqueueBarrier clEnqueueBarrier; + cl_api_clGetExtensionFunctionAddress 
clGetExtensionFunctionAddress; + cl_api_clCreateFromGLBuffer clCreateFromGLBuffer; + cl_api_clCreateFromGLTexture2D clCreateFromGLTexture2D; + cl_api_clCreateFromGLTexture3D clCreateFromGLTexture3D; + cl_api_clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer; + cl_api_clGetGLObjectInfo clGetGLObjectInfo; + cl_api_clGetGLTextureInfo clGetGLTextureInfo; + cl_api_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects; + cl_api_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects; + cl_api_clGetGLContextInfoKHR clGetGLContextInfoKHR; + + /* cl_khr_d3d10_sharing */ + cl_api_clGetDeviceIDsFromD3D10KHR clGetDeviceIDsFromD3D10KHR; + cl_api_clCreateFromD3D10BufferKHR clCreateFromD3D10BufferKHR; + cl_api_clCreateFromD3D10Texture2DKHR clCreateFromD3D10Texture2DKHR; + cl_api_clCreateFromD3D10Texture3DKHR clCreateFromD3D10Texture3DKHR; + cl_api_clEnqueueAcquireD3D10ObjectsKHR clEnqueueAcquireD3D10ObjectsKHR; + cl_api_clEnqueueReleaseD3D10ObjectsKHR clEnqueueReleaseD3D10ObjectsKHR; + + /* OpenCL 1.1 */ + cl_api_clSetEventCallback clSetEventCallback; + cl_api_clCreateSubBuffer clCreateSubBuffer; + cl_api_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback; + cl_api_clCreateUserEvent clCreateUserEvent; + cl_api_clSetUserEventStatus clSetUserEventStatus; + cl_api_clEnqueueReadBufferRect clEnqueueReadBufferRect; + cl_api_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; + cl_api_clEnqueueCopyBufferRect clEnqueueCopyBufferRect; + + /* cl_ext_device_fission */ + cl_api_clCreateSubDevicesEXT clCreateSubDevicesEXT; + cl_api_clRetainDeviceEXT clRetainDeviceEXT; + cl_api_clReleaseDeviceEXT clReleaseDeviceEXT; + + /* cl_khr_gl_event */ + cl_api_clCreateEventFromGLsyncKHR clCreateEventFromGLsyncKHR; + + /* OpenCL 1.2 */ + cl_api_clCreateSubDevices clCreateSubDevices; + cl_api_clRetainDevice clRetainDevice; + cl_api_clReleaseDevice clReleaseDevice; + cl_api_clCreateImage clCreateImage; + cl_api_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels; + cl_api_clCompileProgram clCompileProgram; + cl_api_clLinkProgram clLinkProgram; + cl_api_clUnloadPlatformCompiler clUnloadPlatformCompiler; + cl_api_clGetKernelArgInfo clGetKernelArgInfo; + cl_api_clEnqueueFillBuffer clEnqueueFillBuffer; + cl_api_clEnqueueFillImage clEnqueueFillImage; + cl_api_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects; + cl_api_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList; + cl_api_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList; + cl_api_clGetExtensionFunctionAddressForPlatform + clGetExtensionFunctionAddressForPlatform; + cl_api_clCreateFromGLTexture clCreateFromGLTexture; + + /* cl_khr_d3d11_sharing */ + cl_api_clGetDeviceIDsFromD3D11KHR clGetDeviceIDsFromD3D11KHR; + cl_api_clCreateFromD3D11BufferKHR clCreateFromD3D11BufferKHR; + cl_api_clCreateFromD3D11Texture2DKHR clCreateFromD3D11Texture2DKHR; + cl_api_clCreateFromD3D11Texture3DKHR clCreateFromD3D11Texture3DKHR; + cl_api_clCreateFromDX9MediaSurfaceKHR clCreateFromDX9MediaSurfaceKHR; + cl_api_clEnqueueAcquireD3D11ObjectsKHR clEnqueueAcquireD3D11ObjectsKHR; + cl_api_clEnqueueReleaseD3D11ObjectsKHR clEnqueueReleaseD3D11ObjectsKHR; + + /* cl_khr_dx9_media_sharing */ + cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR + clGetDeviceIDsFromDX9MediaAdapterKHR; + cl_api_clEnqueueAcquireDX9MediaSurfacesKHR + clEnqueueAcquireDX9MediaSurfacesKHR; + cl_api_clEnqueueReleaseDX9MediaSurfacesKHR + clEnqueueReleaseDX9MediaSurfacesKHR; + + /* cl_khr_egl_image */ + cl_api_clCreateFromEGLImageKHR clCreateFromEGLImageKHR; + cl_api_clEnqueueAcquireEGLObjectsKHR 
clEnqueueAcquireEGLObjectsKHR; + cl_api_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR; + + /* cl_khr_egl_event */ + cl_api_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR; + + /* OpenCL 2.0 */ + cl_api_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties; + cl_api_clCreatePipe clCreatePipe; + cl_api_clGetPipeInfo clGetPipeInfo; + cl_api_clSVMAlloc clSVMAlloc; + cl_api_clSVMFree clSVMFree; + cl_api_clEnqueueSVMFree clEnqueueSVMFree; + cl_api_clEnqueueSVMMemcpy clEnqueueSVMMemcpy; + cl_api_clEnqueueSVMMemFill clEnqueueSVMMemFill; + cl_api_clEnqueueSVMMap clEnqueueSVMMap; + cl_api_clEnqueueSVMUnmap clEnqueueSVMUnmap; + cl_api_clCreateSamplerWithProperties clCreateSamplerWithProperties; + cl_api_clSetKernelArgSVMPointer clSetKernelArgSVMPointer; + cl_api_clSetKernelExecInfo clSetKernelExecInfo; + + /* cl_khr_sub_groups */ + cl_api_clGetKernelSubGroupInfoKHR clGetKernelSubGroupInfoKHR; + + /* OpenCL 2.1 */ + cl_api_clCloneKernel clCloneKernel; + cl_api_clCreateProgramWithIL clCreateProgramWithIL; + cl_api_clEnqueueSVMMigrateMem clEnqueueSVMMigrateMem; + cl_api_clGetDeviceAndHostTimer clGetDeviceAndHostTimer; + cl_api_clGetHostTimer clGetHostTimer; + cl_api_clGetKernelSubGroupInfo clGetKernelSubGroupInfo; + cl_api_clSetDefaultDeviceCommandQueue clSetDefaultDeviceCommandQueue; + + /* OpenCL 2.2 */ + cl_api_clSetProgramReleaseCallback clSetProgramReleaseCallback; + cl_api_clSetProgramSpecializationConstant clSetProgramSpecializationConstant; +} cl_icd_dispatch; + +#ifdef __cplusplus +} +#endif + +#endif /* #ifndef OPENCL_CL_ICD_H */ diff --git a/amdocl/CL/cl_platform.h b/amdocl/CL/cl_platform.h new file mode 100644 index 0000000000..7f4ddea5b3 --- /dev/null +++ b/amdocl/CL/cl_platform.h @@ -0,0 +1,1384 @@ +/********************************************************************************** + * Copyright (c) 2008-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/
+
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+
+#include <CL/cl_version.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+  #define CL_API_ENTRY
+  #define CL_API_CALL __stdcall
+  #define CL_CALLBACK __stdcall
+#else
+  #define CL_API_ENTRY
+  #define CL_API_CALL
+  #define CL_CALLBACK
+#endif
+
+/*
+ * Deprecation flags refer to the last version of the header in which the
+ * feature was not deprecated.
+ *
+ * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
+ * deprecation but is deprecated in versions later than 1.1.
+ */
+
+#define CL_EXTENSION_WEAK_LINK
+#define CL_API_SUFFIX__VERSION_1_0
+#define CL_EXT_SUFFIX__VERSION_1_0
+#define CL_API_SUFFIX__VERSION_1_1
+#define CL_EXT_SUFFIX__VERSION_1_1
+#define CL_API_SUFFIX__VERSION_1_2
+#define CL_EXT_SUFFIX__VERSION_1_2
+#define CL_API_SUFFIX__VERSION_2_0
+#define CL_EXT_SUFFIX__VERSION_2_0
+#define CL_API_SUFFIX__VERSION_2_1
+#define CL_EXT_SUFFIX__VERSION_2_1
+#define CL_API_SUFFIX__VERSION_2_2
+#define CL_EXT_SUFFIX__VERSION_2_2
+
+
+#ifdef __GNUC__
+  #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated))
+  #define CL_EXT_PREFIX_DEPRECATED
+#elif defined(_WIN32)
+  #define CL_EXT_SUFFIX_DEPRECATED
+  #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated)
+#else
+  #define CL_EXT_SUFFIX_DEPRECATED
+  #define CL_EXT_PREFIX_DEPRECATED
+#endif
+
+#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+  #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+  #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+#else
+  #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+  #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+#endif
+
+#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+  #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+  #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+#else
+  #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+  #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+#endif
+
+#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+  #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+  #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+#else
+  #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+  #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+#endif
+
+#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS
+  #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
+  #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
+#else
+  #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+  #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+#endif
+
+#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS
+  #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
+  #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
+#else
+  #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+  #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+#endif
+
+#if (defined (_WIN32) && defined(_MSC_VER))
+
+/* scalar types */
+typedef signed __int8 cl_char;
+typedef unsigned __int8 cl_uchar;
+typedef signed __int16 cl_short;
+typedef unsigned __int16 cl_ushort;
+typedef signed __int32 cl_int;
+typedef unsigned __int32 cl_uint;
+typedef signed __int64 cl_long;
+typedef unsigned __int64 cl_ulong;
+
+typedef unsigned __int16 cl_half;
+typedef float cl_float;
+typedef double cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN 1.175494350822287507969e-38f
+#define CL_FLT_EPSILON 1.1920928955078125e-7f
+
+#define CL_HALF_DIG 3
+#define CL_HALF_MANT_DIG 11
+#define CL_HALF_MAX_10_EXP +4
+#define CL_HALF_MAX_EXP +16
+#define CL_HALF_MIN_10_EXP -4
+#define CL_HALF_MIN_EXP -13
+#define CL_HALF_RADIX 2
+#define CL_HALF_MAX 65504.0f
+#define CL_HALF_MIN 6.103515625e-05f
+#define CL_HALF_EPSILON 9.765625e-04f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 1.7976931348623158e+308
+#define CL_DBL_MIN 2.225073858507201383090e-308
+#define CL_DBL_EPSILON 2.220446049250313080847e-16
+
+#define CL_M_E 2.7182818284590452354
+#define CL_M_LOG2E 1.4426950408889634074
+#define CL_M_LOG10E 0.43429448190325182765
+#define CL_M_LN2 0.69314718055994530942
+#define CL_M_LN10 2.30258509299404568402
+#define CL_M_PI 3.14159265358979323846
+#define CL_M_PI_2 1.57079632679489661923
+#define CL_M_PI_4 0.78539816339744830962
+#define CL_M_1_PI 0.31830988618379067154
+#define CL_M_2_PI 0.63661977236758134308
+#define CL_M_2_SQRTPI 1.12837916709551257390
+#define CL_M_SQRT2 1.41421356237309504880
+#define CL_M_SQRT1_2 0.70710678118654752440
+
+#define CL_M_E_F 2.718281828f
+#define CL_M_LOG2E_F 1.442695041f
+#define CL_M_LOG10E_F 0.434294482f
+#define CL_M_LN2_F 0.693147181f
+#define CL_M_LN10_F 2.302585093f
+#define CL_M_PI_F 3.141592654f
+#define CL_M_PI_2_F 1.570796327f
+#define CL_M_PI_4_F 0.785398163f
+#define CL_M_1_PI_F 0.318309886f
+#define CL_M_2_PI_F 0.636619772f
+#define CL_M_2_SQRTPI_F 1.128379167f
+#define CL_M_SQRT2_F 1.414213562f
+#define CL_M_SQRT1_2_F 0.707106781f
+
+#define CL_NAN (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF ((cl_float) 1e50)
+#define CL_HUGE_VAL ((cl_double) 1e500)
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#else
+
+#include <stdint.h>
+
+/* scalar types */
+typedef int8_t cl_char;
+typedef uint8_t cl_uchar;
+typedef int16_t cl_short;
+typedef uint16_t cl_ushort;
+typedef int32_t cl_int;
+typedef uint32_t cl_uint;
+typedef int64_t cl_long;
+typedef uint64_t cl_ulong;
+
+typedef uint16_t cl_half;
+typedef float cl_float;
+typedef double cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
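+
+/* Editorial sketch (not part of the Khronos header): the *_DEPRECATED prefix
+ * and suffix macros defined earlier in this file bracket a declaration,
+ * roughly as in:
+ *
+ *   CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+ *   clEnqueueMarker(cl_command_queue command_queue,
+ *                   cl_event *event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+ *
+ * Building with CL_USE_DEPRECATED_OPENCL_1_1_APIS defined expands both macros
+ * to nothing, so no deprecation warning is emitted for such calls.
+ */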
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN 1.175494350822287507969e-38f
+#define CL_FLT_EPSILON 1.1920928955078125e-7f
+
+#define CL_HALF_DIG 3
+#define CL_HALF_MANT_DIG 11
+#define CL_HALF_MAX_10_EXP +4
+#define CL_HALF_MAX_EXP +16
+#define CL_HALF_MIN_10_EXP -4
+#define CL_HALF_MIN_EXP -13
+#define CL_HALF_RADIX 2
+#define CL_HALF_MAX 65504.0f
+#define CL_HALF_MIN 6.103515625e-05f
+#define CL_HALF_EPSILON 9.765625e-04f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN 2.225073858507201383090e-308
+#define CL_DBL_EPSILON 2.220446049250313080847e-16
+
+#define CL_M_E 2.7182818284590452354
+#define CL_M_LOG2E 1.4426950408889634074
+#define CL_M_LOG10E 0.43429448190325182765
+#define CL_M_LN2 0.69314718055994530942
+#define CL_M_LN10 2.30258509299404568402
+#define CL_M_PI 3.14159265358979323846
+#define CL_M_PI_2 1.57079632679489661923
+#define CL_M_PI_4 0.78539816339744830962
+#define CL_M_1_PI 0.31830988618379067154
+#define CL_M_2_PI 0.63661977236758134308
+#define CL_M_2_SQRTPI 1.12837916709551257390
+#define CL_M_SQRT2 1.41421356237309504880
+#define CL_M_SQRT1_2 0.70710678118654752440
+
+#define CL_M_E_F 2.718281828f
+#define CL_M_LOG2E_F 1.442695041f
+#define CL_M_LOG10E_F 0.434294482f
+#define CL_M_LN2_F 0.693147181f
+#define CL_M_LN10_F 2.302585093f
+#define CL_M_PI_F 3.141592654f
+#define CL_M_PI_2_F 1.570796327f
+#define CL_M_PI_4_F 0.785398163f
+#define CL_M_1_PI_F 0.318309886f
+#define CL_M_2_PI_F 0.636619772f
+#define CL_M_2_SQRTPI_F 1.128379167f
+#define CL_M_SQRT2_F 1.414213562f
+#define CL_M_SQRT1_2_F 0.707106781f
+
+#if defined( __GNUC__ )
+  #define CL_HUGE_VALF __builtin_huge_valf()
+  #define CL_HUGE_VAL __builtin_huge_val()
+  #define CL_NAN __builtin_nanf( "" )
+#else
+  #define CL_HUGE_VALF ((cl_float) 1e50)
+  #define CL_HUGE_VAL ((cl_double) 1e500)
+  float nanf( const char * );
+  #define CL_NAN nanf( "" )
+#endif
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#endif
+
+#include <stddef.h>
+
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which
+ * GL headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int cl_GLint;
+typedef unsigned int cl_GLenum;
+
+/*
+ * Vector types
+ *
+ * Note: OpenCL requires that all types be naturally aligned.
+ * This means that vector types must be naturally aligned.
+ * For example, a vector of four floats must be aligned to
+ * a 16 byte boundary (calculated as 4 * the natural 4-byte
+ * alignment of the float). The alignment qualifiers here
+ * will only function properly if your compiler supports them
+ * and if you don't actively work to defeat them. For example,
+ * in order for a cl_float4 to be 16 byte aligned in a struct,
+ * the start of the struct must itself be 16-byte aligned.
+ *
+ * Maintaining proper alignment is the user's responsibility.
+ */
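+
+/* Editorial sketch (not part of the Khronos header): where the compiler
+ * honors CL_ALIGNED, the natural-alignment rule above can be checked at
+ * compile time, e.g. with C11:
+ *
+ *   #include <assert.h>
+ *   static_assert(_Alignof(cl_float4) == 16,
+ *                 "cl_float4 must be 16-byte aligned");
+ *
+ * and a struct embedding a cl_float4 must itself start on a 16-byte
+ * boundary for the member's alignment to hold.
+ */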
+
+/* Define basic vector types */
+#if defined( __VEC__ )
+  #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+  typedef __vector unsigned char __cl_uchar16;
+  typedef __vector signed char __cl_char16;
+  typedef __vector unsigned short __cl_ushort8;
+  typedef __vector signed short __cl_short8;
+  typedef __vector unsigned int __cl_uint4;
+  typedef __vector signed int __cl_int4;
+  typedef __vector float __cl_float4;
+  #define __CL_UCHAR16__ 1
+  #define __CL_CHAR16__ 1
+  #define __CL_USHORT8__ 1
+  #define __CL_SHORT8__ 1
+  #define __CL_UINT4__ 1
+  #define __CL_INT4__ 1
+  #define __CL_FLOAT4__ 1
+#endif
+
+#if defined( __SSE__ )
+  #if defined( __MINGW64__ )
+    #include <intrin.h>
+  #else
+    #include <xmmintrin.h>
+  #endif
+  #if defined( __GNUC__ )
+    typedef float __cl_float4 __attribute__((vector_size(16)));
+  #else
+    typedef __m128 __cl_float4;
+  #endif
+  #define __CL_FLOAT4__ 1
+#endif
+
+#if defined( __SSE2__ )
+  #if defined( __MINGW64__ )
+    #include <intrin.h>
+  #else
+    #include <emmintrin.h>
+  #endif
+  #if defined( __GNUC__ )
+    typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
+    typedef cl_char __cl_char16 __attribute__((vector_size(16)));
+    typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
+    typedef cl_short __cl_short8 __attribute__((vector_size(16)));
+    typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
+    typedef cl_int __cl_int4 __attribute__((vector_size(16)));
+    typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
+    typedef cl_long __cl_long2 __attribute__((vector_size(16)));
+    typedef cl_double __cl_double2 __attribute__((vector_size(16)));
+  #else
+    typedef __m128i __cl_uchar16;
+    typedef __m128i __cl_char16;
+    typedef __m128i __cl_ushort8;
+    typedef __m128i __cl_short8;
+    typedef __m128i __cl_uint4;
+    typedef __m128i __cl_int4;
+    typedef __m128i __cl_ulong2;
+    typedef __m128i __cl_long2;
+    typedef __m128d __cl_double2;
+  #endif
+  #define __CL_UCHAR16__ 1
+  #define __CL_CHAR16__ 1
+  #define __CL_USHORT8__ 1
+  #define __CL_SHORT8__ 1
+  #define __CL_INT4__ 1
+  #define __CL_UINT4__ 1
+  #define __CL_ULONG2__ 1
+  #define __CL_LONG2__ 1
+  #define __CL_DOUBLE2__ 1
+#endif
+
+#if defined( __MMX__ )
+  #include <mmintrin.h>
+  #if defined( __GNUC__ )
+    typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
+    typedef cl_char __cl_char8 __attribute__((vector_size(8)));
+    typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
+    typedef cl_short __cl_short4 __attribute__((vector_size(8)));
+    typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
+    typedef cl_int __cl_int2 __attribute__((vector_size(8)));
+    typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
+    typedef cl_long __cl_long1 __attribute__((vector_size(8)));
+    typedef cl_float __cl_float2 __attribute__((vector_size(8)));
+  #else
+    typedef __m64 __cl_uchar8;
+    typedef __m64 __cl_char8;
+    typedef __m64 __cl_ushort4;
+    typedef __m64 __cl_short4;
+    typedef __m64 __cl_uint2;
+    typedef __m64 __cl_int2;
+    typedef __m64 __cl_ulong1;
+    typedef __m64 __cl_long1;
+    typedef __m64 __cl_float2;
+  #endif
+  #define __CL_UCHAR8__ 1
+  #define __CL_CHAR8__ 1
+  #define __CL_USHORT4__ 1
+  #define __CL_SHORT4__ 1
+  #define __CL_INT2__ 1
+  #define __CL_UINT2__ 1
+  #define __CL_ULONG1__ 1
+  #define __CL_LONG1__ 1
+  #define __CL_FLOAT2__ 1
+#endif
+
+#if defined( __AVX__ )
+  #if defined( __MINGW64__ )
+    #include <intrin.h>
+  #else
+    #include <immintrin.h>
+  #endif
+  #if defined( __GNUC__ )
+    typedef cl_float __cl_float8 __attribute__((vector_size(32)));
+    typedef cl_double __cl_double4 __attribute__((vector_size(32)));
+  #else
+    typedef __m256 __cl_float8;
+    typedef __m256d __cl_double4;
+  #endif
+  #define __CL_FLOAT8__ 1
+  #define __CL_DOUBLE4__ 1
+#endif
+
+/* Define capabilities for anonymous struct members. */
+#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#define __CL_HAS_ANON_STRUCT__ 1
+#define __CL_ANON_STRUCT__
+#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#define __CL_HAS_ANON_STRUCT__ 1
+#define __CL_ANON_STRUCT__ __extension__
+#elif defined( _WIN32) && defined(_MSC_VER)
+  #if _MSC_VER >= 1500
+  /* Microsoft Developer Studio 2008 supports anonymous structs, but
+   * complains by default. */
+  #define __CL_HAS_ANON_STRUCT__ 1
+  #define __CL_ANON_STRUCT__
+  /* Disable warning C4201: nonstandard extension used : nameless
+   * struct/union */
+  #pragma warning( push )
+  #pragma warning( disable : 4201 )
+  #endif
+#else
+#define __CL_HAS_ANON_STRUCT__ 0
+#define __CL_ANON_STRUCT__
+#endif
+
+/* Define alignment keys */
+#if defined( __GNUC__ )
+  #define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && (_MSC_VER)
+  /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
+  /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
+  /* #include <crtdefs.h> */
+  /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
+  #define CL_ALIGNED(_x)
+#else
+  #warning Need to implement some method to align data here
+  #define CL_ALIGNED(_x)
+#endif
+
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
+#if __CL_HAS_ANON_STRUCT__
+  /* .xyzw and .s0123...{f|F} are supported */
+  #define CL_HAS_NAMED_VECTOR_FIELDS 1
+  /* .hi and .lo are supported */
+  #define CL_HAS_HI_LO_VECTOR_FIELDS 1
+#endif
+
+/* Define cl_vector types */
+
+/* ---- cl_charn ---- */
+typedef union
+{
+  cl_char CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+  __CL_ANON_STRUCT__ struct{ cl_char x, y; };
+  __CL_ANON_STRUCT__ struct{ cl_char s0, s1; };
+  __CL_ANON_STRUCT__ struct{ cl_char lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+  __cl_char2 v2;
+#endif
+}cl_char2;
+
+typedef union
+{
+  cl_char CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+  __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
+  __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; };
+  __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+  __cl_char2 v2[2];
+#endif
+#if defined( __CL_CHAR4__)
+  __cl_char4 v4;
+#endif
+}cl_char4;
+
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5.
*/ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. 
*/ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. 
*/ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. 
*/ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + + +/* ---- cl_halfn ---- */ +typedef union +{ + cl_half CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2; +#endif +}cl_half2; + +typedef union +{ + cl_half CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[2]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4; +#endif +}cl_half4; + +/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. 
*/ +typedef cl_half4 cl_half3; + +typedef union +{ + cl_half CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[4]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[2]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8; +#endif +}cl_half8; + +typedef union +{ + cl_half CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[8]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[4]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8[2]; +#endif +#if defined( __CL_HALF16__ ) + __cl_half16 v16; +#endif +}cl_half16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. 
*/ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. 
*/ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. 
*/ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. 
*/ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. 
*/ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
+ * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \"
+ * Each line thereafter of OpenCL C source must end with: \n\
+ * The last line ends in ";
+ *
+ * Example:
+ *
+ * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ * kernel void foo( int a, float * b ) \n\
+ * { \n\
+ *    // my comment \n\
+ *    *b[ get_global_id(0)] = a; \n\
+ * } \n\
+ * ";
+ *
+ * This should correctly set up the line, (column) and file information for your source
+ * string so you can do source level debugging.
+ */
+#define __CL_STRINGIFY( _x ) # _x
+#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x )
+#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __CL_HAS_ANON_STRUCT__
+#undef __CL_ANON_STRUCT__
+#if defined( _WIN32) && defined(_MSC_VER)
+  #if _MSC_VER >=1500
+  #pragma warning( pop )
+  #endif
+#endif
+
+#endif /* __CL_PLATFORM_H */
diff --git a/amdocl/CL/cl_version.h b/amdocl/CL/cl_version.h
new file mode 100644
index 0000000000..bb766cb9bb
--- /dev/null
+++ b/amdocl/CL/cl_version.h
@@ -0,0 +1,86 @@
+/*******************************************************************************
+ * Copyright (c) 2018 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __CL_VERSION_H
+#define __CL_VERSION_H
+
+/* Detect which version to target */
+#if !defined(CL_TARGET_OPENCL_VERSION)
+#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)")
+#define CL_TARGET_OPENCL_VERSION 220
+#endif
+#if CL_TARGET_OPENCL_VERSION != 100 && \
+    CL_TARGET_OPENCL_VERSION != 110 && \
+    CL_TARGET_OPENCL_VERSION != 120 && \
+    CL_TARGET_OPENCL_VERSION != 200 && \
+    CL_TARGET_OPENCL_VERSION != 210 && \
+    CL_TARGET_OPENCL_VERSION != 220
+#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220). Defaulting to 220 (OpenCL 2.2)")
+#undef CL_TARGET_OPENCL_VERSION
+#define CL_TARGET_OPENCL_VERSION 220
+#endif
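+
+/* Editorial sketch (not part of the Khronos header): an application selects
+ * the API version it was written against by defining the target version
+ * before including any OpenCL header, e.g.:
+ *
+ *   #define CL_TARGET_OPENCL_VERSION 120
+ *   #include <CL/opencl.h>
+ *
+ * which enables CL_VERSION_1_0 through CL_VERSION_1_2 below and keeps the
+ * APIs deprecated after 1.2 available without deprecation warnings.
+ */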
+
+
+/* OpenCL Version */
+#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)
+#define CL_VERSION_2_2 1
+#endif
+#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)
+#define CL_VERSION_2_1 1
+#endif
+#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)
+#define CL_VERSION_2_0 1
+#endif
+#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)
+#define CL_VERSION_1_2 1
+#endif
+#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)
+#define CL_VERSION_1_1 1
+#endif
+#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)
+#define CL_VERSION_1_0 1
+#endif
+
+/* Allow deprecated APIs for older OpenCL versions. */
+#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
+#define CL_USE_DEPRECATED_OPENCL_2_1_APIS
+#endif
+#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
+#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
+#endif
+#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#endif
+#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#endif
+#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)
+#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
+#endif
+
+#endif /* __CL_VERSION_H */
diff --git a/amdocl/CL/opencl.h b/amdocl/CL/opencl.h
new file mode 100644
index 0000000000..143d1d2dc6
--- /dev/null
+++ b/amdocl/CL/opencl.h
@@ -0,0 +1,47 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_H
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+#include <CL/cl_ext.h>
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_H */
diff --git a/amdocl/EGL/egl.h b/amdocl/EGL/egl.h
new file mode 100644
index 0000000000..99ea342a47
--- /dev/null
+++ b/amdocl/EGL/egl.h
@@ -0,0 +1,329 @@
+/* -*- mode: c; tab-width: 8; -*- */
+/* vi: set sw=4 ts=8: */
+/* Reference version of egl.h for EGL 1.4.
+ * $Revision: 9356 $ on $Date: 2009-10-21 02:52:25 -0700 (Wed, 21 Oct 2009) $
+ */
+
+/*
+** Copyright (c) 2007-2009 The Khronos Group Inc.
+**
+** Permission is hereby granted, free of charge, to any person obtaining a
+** copy of this software and/or associated documentation files (the
+** "Materials"), to deal in the Materials without restriction, including
+** without limitation the rights to use, copy, modify, merge, publish,
+** distribute, sublicense, and/or sell copies of the Materials, and to
+** permit persons to whom the Materials are furnished to do so, subject to
+** the following conditions:
+**
+** The above copyright notice and this permission notice shall be included
+** in all copies or substantial portions of the Materials.
+**
+** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+*/
+
+#ifndef __egl_h_
+#define __egl_h_
+
+/* All platform-dependent types and macro boilerplate (such as EGLAPI
+ * and EGLAPIENTRY) should go in eglplatform.h.
+ */
+#include <EGL/eglplatform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* EGL Types */
+/* EGLint is defined in eglplatform.h */
+typedef unsigned int EGLBoolean;
+typedef unsigned int EGLenum;
+typedef void *EGLConfig;
+typedef void *EGLContext;
+typedef void *EGLDisplay;
+typedef void *EGLSurface;
+typedef void *EGLClientBuffer;
+
+/* EGL Versioning */
+#define EGL_VERSION_1_0 1
+#define EGL_VERSION_1_1 1
+#define EGL_VERSION_1_2 1
+#define EGL_VERSION_1_3 1
+#define EGL_VERSION_1_4 1
+
+/* EGL Enumerants. Bitmasks and other exceptional cases aside, most
+ * enums are assigned unique values starting at 0x3000.
+ */ + +/* EGL aliases */ +#define EGL_FALSE 0 +#define EGL_TRUE 1 + +/* Out-of-band handle values */ +#define EGL_DEFAULT_DISPLAY ((EGLNativeDisplayType)0) +#define EGL_NO_CONTEXT ((EGLContext)0) +#define EGL_NO_DISPLAY ((EGLDisplay)0) +#define EGL_NO_SURFACE ((EGLSurface)0) + +/* Out-of-band attribute value */ +#define EGL_DONT_CARE ((EGLint)-1) + +/* Errors / GetError return values */ +#define EGL_SUCCESS 0x3000 +#define EGL_NOT_INITIALIZED 0x3001 +#define EGL_BAD_ACCESS 0x3002 +#define EGL_BAD_ALLOC 0x3003 +#define EGL_BAD_ATTRIBUTE 0x3004 +#define EGL_BAD_CONFIG 0x3005 +#define EGL_BAD_CONTEXT 0x3006 +#define EGL_BAD_CURRENT_SURFACE 0x3007 +#define EGL_BAD_DISPLAY 0x3008 +#define EGL_BAD_MATCH 0x3009 +#define EGL_BAD_NATIVE_PIXMAP 0x300A +#define EGL_BAD_NATIVE_WINDOW 0x300B +#define EGL_BAD_PARAMETER 0x300C +#define EGL_BAD_SURFACE 0x300D +#define EGL_CONTEXT_LOST 0x300E /* EGL 1.1 - IMG_power_management */ + +/* Reserved 0x300F-0x301F for additional errors */ + +/* Config attributes */ +#define EGL_BUFFER_SIZE 0x3020 +#define EGL_ALPHA_SIZE 0x3021 +#define EGL_BLUE_SIZE 0x3022 +#define EGL_GREEN_SIZE 0x3023 +#define EGL_RED_SIZE 0x3024 +#define EGL_DEPTH_SIZE 0x3025 +#define EGL_STENCIL_SIZE 0x3026 +#define EGL_CONFIG_CAVEAT 0x3027 +#define EGL_CONFIG_ID 0x3028 +#define EGL_LEVEL 0x3029 +#define EGL_MAX_PBUFFER_HEIGHT 0x302A +#define EGL_MAX_PBUFFER_PIXELS 0x302B +#define EGL_MAX_PBUFFER_WIDTH 0x302C +#define EGL_NATIVE_RENDERABLE 0x302D +#define EGL_NATIVE_VISUAL_ID 0x302E +#define EGL_NATIVE_VISUAL_TYPE 0x302F +#define EGL_SAMPLES 0x3031 +#define EGL_SAMPLE_BUFFERS 0x3032 +#define EGL_SURFACE_TYPE 0x3033 +#define EGL_TRANSPARENT_TYPE 0x3034 +#define EGL_TRANSPARENT_BLUE_VALUE 0x3035 +#define EGL_TRANSPARENT_GREEN_VALUE 0x3036 +#define EGL_TRANSPARENT_RED_VALUE 0x3037 +#define EGL_NONE 0x3038 /* Attrib list terminator */ +#define EGL_BIND_TO_TEXTURE_RGB 0x3039 +#define EGL_BIND_TO_TEXTURE_RGBA 0x303A +#define EGL_MIN_SWAP_INTERVAL 0x303B +#define EGL_MAX_SWAP_INTERVAL 0x303C +#define EGL_LUMINANCE_SIZE 0x303D +#define EGL_ALPHA_MASK_SIZE 0x303E +#define EGL_COLOR_BUFFER_TYPE 0x303F +#define EGL_RENDERABLE_TYPE 0x3040 +#define EGL_MATCH_NATIVE_PIXMAP 0x3041 /* Pseudo-attribute (not queryable) */ +#define EGL_CONFORMANT 0x3042 + +/* Reserved 0x3041-0x304F for additional config attributes */ + +/* Config attribute values */ +#define EGL_SLOW_CONFIG 0x3050 /* EGL_CONFIG_CAVEAT value */ +#define EGL_NON_CONFORMANT_CONFIG 0x3051 /* EGL_CONFIG_CAVEAT value */ +#define EGL_TRANSPARENT_RGB 0x3052 /* EGL_TRANSPARENT_TYPE value */ +#define EGL_RGB_BUFFER 0x308E /* EGL_COLOR_BUFFER_TYPE value */ +#define EGL_LUMINANCE_BUFFER 0x308F /* EGL_COLOR_BUFFER_TYPE value */ + +/* More config attribute values, for EGL_TEXTURE_FORMAT */ +#define EGL_NO_TEXTURE 0x305C +#define EGL_TEXTURE_RGB 0x305D +#define EGL_TEXTURE_RGBA 0x305E +#define EGL_TEXTURE_2D 0x305F + +/* Config attribute mask bits */ +#define EGL_PBUFFER_BIT 0x0001 /* EGL_SURFACE_TYPE mask bits */ +#define EGL_PIXMAP_BIT 0x0002 /* EGL_SURFACE_TYPE mask bits */ +#define EGL_WINDOW_BIT 0x0004 /* EGL_SURFACE_TYPE mask bits */ +#define EGL_VG_COLORSPACE_LINEAR_BIT 0x0020 /* EGL_SURFACE_TYPE mask bits */ +#define EGL_VG_ALPHA_FORMAT_PRE_BIT 0x0040 /* EGL_SURFACE_TYPE mask bits */ +#define EGL_MULTISAMPLE_RESOLVE_BOX_BIT 0x0200 /* EGL_SURFACE_TYPE mask bits */ +#define EGL_SWAP_BEHAVIOR_PRESERVED_BIT 0x0400 /* EGL_SURFACE_TYPE mask bits */ + +#define EGL_OPENGL_ES_BIT 0x0001 /* EGL_RENDERABLE_TYPE mask bits */ +#define EGL_OPENVG_BIT 0x0002 /* 
EGL_RENDERABLE_TYPE mask bits */ +#define EGL_OPENGL_ES2_BIT 0x0004 /* EGL_RENDERABLE_TYPE mask bits */ +#define EGL_OPENGL_BIT 0x0008 /* EGL_RENDERABLE_TYPE mask bits */ + +/* QueryString targets */ +#define EGL_VENDOR 0x3053 +#define EGL_VERSION 0x3054 +#define EGL_EXTENSIONS 0x3055 +#define EGL_CLIENT_APIS 0x308D + +/* QuerySurface / SurfaceAttrib / CreatePbufferSurface targets */ +#define EGL_HEIGHT 0x3056 +#define EGL_WIDTH 0x3057 +#define EGL_LARGEST_PBUFFER 0x3058 +#define EGL_TEXTURE_FORMAT 0x3080 +#define EGL_TEXTURE_TARGET 0x3081 +#define EGL_MIPMAP_TEXTURE 0x3082 +#define EGL_MIPMAP_LEVEL 0x3083 +#define EGL_RENDER_BUFFER 0x3086 +#define EGL_VG_COLORSPACE 0x3087 +#define EGL_VG_ALPHA_FORMAT 0x3088 +#define EGL_HORIZONTAL_RESOLUTION 0x3090 +#define EGL_VERTICAL_RESOLUTION 0x3091 +#define EGL_PIXEL_ASPECT_RATIO 0x3092 +#define EGL_SWAP_BEHAVIOR 0x3093 +#define EGL_MULTISAMPLE_RESOLVE 0x3099 + +/* EGL_RENDER_BUFFER values / BindTexImage / ReleaseTexImage buffer targets */ +#define EGL_BACK_BUFFER 0x3084 +#define EGL_SINGLE_BUFFER 0x3085 + +/* OpenVG color spaces */ +#define EGL_VG_COLORSPACE_sRGB 0x3089 /* EGL_VG_COLORSPACE value */ +#define EGL_VG_COLORSPACE_LINEAR 0x308A /* EGL_VG_COLORSPACE value */ + +/* OpenVG alpha formats */ +#define EGL_VG_ALPHA_FORMAT_NONPRE 0x308B /* EGL_ALPHA_FORMAT value */ +#define EGL_VG_ALPHA_FORMAT_PRE 0x308C /* EGL_ALPHA_FORMAT value */ + +/* Constant scale factor by which fractional display resolutions & + * aspect ratio are scaled when queried as integer values. + */ +#define EGL_DISPLAY_SCALING 10000 + +/* Unknown display resolution/aspect ratio */ +#define EGL_UNKNOWN ((EGLint)-1) + +/* Back buffer swap behaviors */ +#define EGL_BUFFER_PRESERVED 0x3094 /* EGL_SWAP_BEHAVIOR value */ +#define EGL_BUFFER_DESTROYED 0x3095 /* EGL_SWAP_BEHAVIOR value */ + +/* CreatePbufferFromClientBuffer buffer types */ +#define EGL_OPENVG_IMAGE 0x3096 + +/* QueryContext targets */ +#define EGL_CONTEXT_CLIENT_TYPE 0x3097 + +/* CreateContext attributes */ +#define EGL_CONTEXT_CLIENT_VERSION 0x3098 + +/* Multisample resolution behaviors */ +#define EGL_MULTISAMPLE_RESOLVE_DEFAULT 0x309A /* EGL_MULTISAMPLE_RESOLVE value */ +#define EGL_MULTISAMPLE_RESOLVE_BOX 0x309B /* EGL_MULTISAMPLE_RESOLVE value */ + +/* BindAPI/QueryAPI targets */ +#define EGL_OPENGL_ES_API 0x30A0 +#define EGL_OPENVG_API 0x30A1 +#define EGL_OPENGL_API 0x30A2 + +/* GetCurrentSurface targets */ +#define EGL_DRAW 0x3059 +#define EGL_READ 0x305A + +/* WaitNative engines */ +#define EGL_CORE_NATIVE_ENGINE 0x305B + +/* EGL 1.2 tokens renamed for consistency in EGL 1.3 */ +#define EGL_COLORSPACE EGL_VG_COLORSPACE +#define EGL_ALPHA_FORMAT EGL_VG_ALPHA_FORMAT +#define EGL_COLORSPACE_sRGB EGL_VG_COLORSPACE_sRGB +#define EGL_COLORSPACE_LINEAR EGL_VG_COLORSPACE_LINEAR +#define EGL_ALPHA_FORMAT_NONPRE EGL_VG_ALPHA_FORMAT_NONPRE +#define EGL_ALPHA_FORMAT_PRE EGL_VG_ALPHA_FORMAT_PRE + +/* EGL extensions must request enum blocks from the Khronos + * API Registrar, who maintains the enumerant registry. Submit + * a bug in Khronos Bugzilla against task "Registry". 
+ */ + + + +/* EGL Functions */ + +EGLAPI EGLint EGLAPIENTRY eglGetError(void); + +EGLAPI EGLDisplay EGLAPIENTRY eglGetDisplay(EGLNativeDisplayType display_id); +EGLAPI EGLBoolean EGLAPIENTRY eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor); +EGLAPI EGLBoolean EGLAPIENTRY eglTerminate(EGLDisplay dpy); + +EGLAPI const char * EGLAPIENTRY eglQueryString(EGLDisplay dpy, EGLint name); + +EGLAPI EGLBoolean EGLAPIENTRY eglGetConfigs(EGLDisplay dpy, EGLConfig *configs, + EGLint config_size, EGLint *num_config); +EGLAPI EGLBoolean EGLAPIENTRY eglChooseConfig(EGLDisplay dpy, const EGLint *attrib_list, + EGLConfig *configs, EGLint config_size, + EGLint *num_config); +EGLAPI EGLBoolean EGLAPIENTRY eglGetConfigAttrib(EGLDisplay dpy, EGLConfig config, + EGLint attribute, EGLint *value); + +EGLAPI EGLSurface EGLAPIENTRY eglCreateWindowSurface(EGLDisplay dpy, EGLConfig config, + EGLNativeWindowType win, + const EGLint *attrib_list); +EGLAPI EGLSurface EGLAPIENTRY eglCreatePbufferSurface(EGLDisplay dpy, EGLConfig config, + const EGLint *attrib_list); +EGLAPI EGLSurface EGLAPIENTRY eglCreatePixmapSurface(EGLDisplay dpy, EGLConfig config, + EGLNativePixmapType pixmap, + const EGLint *attrib_list); +EGLAPI EGLBoolean EGLAPIENTRY eglDestroySurface(EGLDisplay dpy, EGLSurface surface); +EGLAPI EGLBoolean EGLAPIENTRY eglQuerySurface(EGLDisplay dpy, EGLSurface surface, + EGLint attribute, EGLint *value); + +EGLAPI EGLBoolean EGLAPIENTRY eglBindAPI(EGLenum api); +EGLAPI EGLenum EGLAPIENTRY eglQueryAPI(void); + +EGLAPI EGLBoolean EGLAPIENTRY eglWaitClient(void); + +EGLAPI EGLBoolean EGLAPIENTRY eglReleaseThread(void); + +EGLAPI EGLSurface EGLAPIENTRY eglCreatePbufferFromClientBuffer( + EGLDisplay dpy, EGLenum buftype, EGLClientBuffer buffer, + EGLConfig config, const EGLint *attrib_list); + +EGLAPI EGLBoolean EGLAPIENTRY eglSurfaceAttrib(EGLDisplay dpy, EGLSurface surface, + EGLint attribute, EGLint value); +EGLAPI EGLBoolean EGLAPIENTRY eglBindTexImage(EGLDisplay dpy, EGLSurface surface, EGLint buffer); +EGLAPI EGLBoolean EGLAPIENTRY eglReleaseTexImage(EGLDisplay dpy, EGLSurface surface, EGLint buffer); + + +EGLAPI EGLBoolean EGLAPIENTRY eglSwapInterval(EGLDisplay dpy, EGLint interval); + + +EGLAPI EGLContext EGLAPIENTRY eglCreateContext(EGLDisplay dpy, EGLConfig config, + EGLContext share_context, + const EGLint *attrib_list); +EGLAPI EGLBoolean EGLAPIENTRY eglDestroyContext(EGLDisplay dpy, EGLContext ctx); +EGLAPI EGLBoolean EGLAPIENTRY eglMakeCurrent(EGLDisplay dpy, EGLSurface draw, + EGLSurface read, EGLContext ctx); + +EGLAPI EGLContext EGLAPIENTRY eglGetCurrentContext(void); +EGLAPI EGLSurface EGLAPIENTRY eglGetCurrentSurface(EGLint readdraw); +EGLAPI EGLDisplay EGLAPIENTRY eglGetCurrentDisplay(void); +EGLAPI EGLBoolean EGLAPIENTRY eglQueryContext(EGLDisplay dpy, EGLContext ctx, + EGLint attribute, EGLint *value); + +EGLAPI EGLBoolean EGLAPIENTRY eglWaitGL(void); +EGLAPI EGLBoolean EGLAPIENTRY eglWaitNative(EGLint engine); +EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffers(EGLDisplay dpy, EGLSurface surface); +EGLAPI EGLBoolean EGLAPIENTRY eglCopyBuffers(EGLDisplay dpy, EGLSurface surface, + EGLNativePixmapType target); + +/* This is a generic function pointer type, whose name indicates it must + * be cast to the proper type *and calling convention* before use. + */ +typedef void (*__eglMustCastToProperFunctionPointerType)(void); + +/* Now, define eglGetProcAddress using the generic function ptr. 
type */ +EGLAPI __eglMustCastToProperFunctionPointerType EGLAPIENTRY + eglGetProcAddress(const char *procname); + +#ifdef __cplusplus +} +#endif + +#endif /* __egl_h_ */ diff --git a/amdocl/EGL/eglext.h b/amdocl/EGL/eglext.h new file mode 100644 index 0000000000..2317b0cf45 --- /dev/null +++ b/amdocl/EGL/eglext.h @@ -0,0 +1,645 @@ +#ifndef __eglext_h_ +#define __eglext_h_ 1 + +#ifdef __cplusplus +extern "C" { +#endif + +/* +** Copyright (c) 2013 The Khronos Group Inc. +** +** Permission is hereby granted, free of charge, to any person obtaining a +** copy of this software and/or associated documentation files (the +** "Materials"), to deal in the Materials without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Materials, and to +** permit persons to whom the Materials are furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be included +** in all copies or substantial portions of the Materials. +** +** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. +*/ +/* +** This header is generated from the Khronos OpenGL / OpenGL ES XML +** API Registry. The current version of the Registry, generator scripts +** used to make the header, and the header can be found at +** http://www.opengl.org/registry/ +** +** Khronos $Revision: 24350 $ on $Date: 2013-12-04 12:46:23 -0800 (Wed, 04 Dec 2013) $ +*/ + +#include <EGL/eglplatform.h> + +#define EGL_EGLEXT_VERSION 20131204 + +/* Generated C header for: + * API: egl + * Versions considered: .* + * Versions emitted: _nomatch_^ + * Default extensions included: egl + * Additional extensions included: _nomatch_^ + * Extensions removed: _nomatch_^ + */ + +#ifndef EGL_KHR_cl_event +#define EGL_KHR_cl_event 1 +#define EGL_CL_EVENT_HANDLE_KHR 0x309C +#define EGL_SYNC_CL_EVENT_KHR 0x30FE +#define EGL_SYNC_CL_EVENT_COMPLETE_KHR 0x30FF +#endif /* EGL_KHR_cl_event */ + +#ifndef EGL_KHR_cl_event2 +#define EGL_KHR_cl_event2 1 +typedef void *EGLSyncKHR; +typedef intptr_t EGLAttribKHR; +typedef EGLSyncKHR (EGLAPIENTRYP PFNEGLCREATESYNC64KHRPROC) (EGLDisplay dpy, EGLenum type, const EGLAttribKHR *attrib_list); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLSyncKHR EGLAPIENTRY eglCreateSync64KHR (EGLDisplay dpy, EGLenum type, const EGLAttribKHR *attrib_list); +#endif +#endif /* EGL_KHR_cl_event2 */ + +#ifndef EGL_KHR_client_get_all_proc_addresses +#define EGL_KHR_client_get_all_proc_addresses 1 +#endif /* EGL_KHR_client_get_all_proc_addresses */ + +#ifndef EGL_KHR_config_attribs +#define EGL_KHR_config_attribs 1 +#define EGL_CONFORMANT_KHR 0x3042 +#define EGL_VG_COLORSPACE_LINEAR_BIT_KHR 0x0020 +#define EGL_VG_ALPHA_FORMAT_PRE_BIT_KHR 0x0040 +#endif /* EGL_KHR_config_attribs */ + +#ifndef EGL_KHR_create_context +#define EGL_KHR_create_context 1 +#define EGL_CONTEXT_MAJOR_VERSION_KHR 0x3098 +#define EGL_CONTEXT_MINOR_VERSION_KHR 0x30FB +#define EGL_CONTEXT_FLAGS_KHR 0x30FC +#define EGL_CONTEXT_OPENGL_PROFILE_MASK_KHR 0x30FD +#define EGL_CONTEXT_OPENGL_RESET_NOTIFICATION_STRATEGY_KHR 0x31BD
+#define EGL_NO_RESET_NOTIFICATION_KHR 0x31BE +#define EGL_LOSE_CONTEXT_ON_RESET_KHR 0x31BF +#define EGL_CONTEXT_OPENGL_DEBUG_BIT_KHR 0x00000001 +#define EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE_BIT_KHR 0x00000002 +#define EGL_CONTEXT_OPENGL_ROBUST_ACCESS_BIT_KHR 0x00000004 +#define EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT_KHR 0x00000001 +#define EGL_CONTEXT_OPENGL_COMPATIBILITY_PROFILE_BIT_KHR 0x00000002 +#define EGL_OPENGL_ES3_BIT_KHR 0x00000040 +#endif /* EGL_KHR_create_context */ + +#ifndef EGL_KHR_fence_sync +#define EGL_KHR_fence_sync 1 +#ifdef KHRONOS_SUPPORT_INT64 +#define EGL_SYNC_PRIOR_COMMANDS_COMPLETE_KHR 0x30F0 +#define EGL_SYNC_CONDITION_KHR 0x30F8 +#define EGL_SYNC_FENCE_KHR 0x30F9 +#endif /* KHRONOS_SUPPORT_INT64 */ +#endif /* EGL_KHR_fence_sync */ + +#ifndef EGL_KHR_get_all_proc_addresses +#define EGL_KHR_get_all_proc_addresses 1 +#endif /* EGL_KHR_get_all_proc_addresses */ + +#ifndef EGL_KHR_gl_renderbuffer_image +#define EGL_KHR_gl_renderbuffer_image 1 +#define EGL_GL_RENDERBUFFER_KHR 0x30B9 +#endif /* EGL_KHR_gl_renderbuffer_image */ + +#ifndef EGL_KHR_gl_texture_2D_image +#define EGL_KHR_gl_texture_2D_image 1 +#define EGL_GL_TEXTURE_2D_KHR 0x30B1 +#define EGL_GL_TEXTURE_LEVEL_KHR 0x30BC +#endif /* EGL_KHR_gl_texture_2D_image */ + +#ifndef EGL_KHR_gl_texture_3D_image +#define EGL_KHR_gl_texture_3D_image 1 +#define EGL_GL_TEXTURE_3D_KHR 0x30B2 +#define EGL_GL_TEXTURE_ZOFFSET_KHR 0x30BD +#endif /* EGL_KHR_gl_texture_3D_image */ + +#ifndef EGL_KHR_gl_texture_cubemap_image +#define EGL_KHR_gl_texture_cubemap_image 1 +#define EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X_KHR 0x30B3 +#define EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X_KHR 0x30B4 +#define EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y_KHR 0x30B5 +#define EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_KHR 0x30B6 +#define EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z_KHR 0x30B7 +#define EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_KHR 0x30B8 +#endif /* EGL_KHR_gl_texture_cubemap_image */ + +#ifndef EGL_KHR_image +#define EGL_KHR_image 1 +typedef void *EGLImageKHR; +#define EGL_NATIVE_PIXMAP_KHR 0x30B0 +#define EGL_NO_IMAGE_KHR ((EGLImageKHR)0) +typedef EGLImageKHR (EGLAPIENTRYP PFNEGLCREATEIMAGEKHRPROC) (EGLDisplay dpy, EGLContext ctx, EGLenum target, EGLClientBuffer buffer, const EGLint *attrib_list); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYIMAGEKHRPROC) (EGLDisplay dpy, EGLImageKHR image); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLImageKHR EGLAPIENTRY eglCreateImageKHR (EGLDisplay dpy, EGLContext ctx, EGLenum target, EGLClientBuffer buffer, const EGLint *attrib_list); +EGLAPI EGLBoolean EGLAPIENTRY eglDestroyImageKHR (EGLDisplay dpy, EGLImageKHR image); +#endif +#endif /* EGL_KHR_image */ + +#ifndef EGL_KHR_image_base +#define EGL_KHR_image_base 1 +#define EGL_IMAGE_PRESERVED_KHR 0x30D2 +#endif /* EGL_KHR_image_base */ + +#ifndef EGL_KHR_image_pixmap +#define EGL_KHR_image_pixmap 1 +#endif /* EGL_KHR_image_pixmap */ + +#ifndef EGL_KHR_lock_surface +#define EGL_KHR_lock_surface 1 +#define EGL_READ_SURFACE_BIT_KHR 0x0001 +#define EGL_WRITE_SURFACE_BIT_KHR 0x0002 +#define EGL_LOCK_SURFACE_BIT_KHR 0x0080 +#define EGL_OPTIMAL_FORMAT_BIT_KHR 0x0100 +#define EGL_MATCH_FORMAT_KHR 0x3043 +#define EGL_FORMAT_RGB_565_EXACT_KHR 0x30C0 +#define EGL_FORMAT_RGB_565_KHR 0x30C1 +#define EGL_FORMAT_RGBA_8888_EXACT_KHR 0x30C2 +#define EGL_FORMAT_RGBA_8888_KHR 0x30C3 +#define EGL_MAP_PRESERVE_PIXELS_KHR 0x30C4 +#define EGL_LOCK_USAGE_HINT_KHR 0x30C5 +#define EGL_BITMAP_POINTER_KHR 0x30C6 +#define EGL_BITMAP_PITCH_KHR 0x30C7 +#define EGL_BITMAP_ORIGIN_KHR 0x30C8 +#define 
EGL_BITMAP_PIXEL_RED_OFFSET_KHR 0x30C9 +#define EGL_BITMAP_PIXEL_GREEN_OFFSET_KHR 0x30CA +#define EGL_BITMAP_PIXEL_BLUE_OFFSET_KHR 0x30CB +#define EGL_BITMAP_PIXEL_ALPHA_OFFSET_KHR 0x30CC +#define EGL_BITMAP_PIXEL_LUMINANCE_OFFSET_KHR 0x30CD +#define EGL_LOWER_LEFT_KHR 0x30CE +#define EGL_UPPER_LEFT_KHR 0x30CF +typedef EGLBoolean (EGLAPIENTRYP PFNEGLLOCKSURFACEKHRPROC) (EGLDisplay dpy, EGLSurface surface, const EGLint *attrib_list); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLUNLOCKSURFACEKHRPROC) (EGLDisplay dpy, EGLSurface surface); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglLockSurfaceKHR (EGLDisplay dpy, EGLSurface surface, const EGLint *attrib_list); +EGLAPI EGLBoolean EGLAPIENTRY eglUnlockSurfaceKHR (EGLDisplay dpy, EGLSurface surface); +#endif +#endif /* EGL_KHR_lock_surface */ + +#ifndef EGL_KHR_lock_surface2 +#define EGL_KHR_lock_surface2 1 +#define EGL_BITMAP_PIXEL_SIZE_KHR 0x3110 +#endif /* EGL_KHR_lock_surface2 */ + +#ifndef EGL_KHR_lock_surface3 +#define EGL_KHR_lock_surface3 1 +typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYSURFACE64KHRPROC) (EGLDisplay dpy, EGLSurface surface, EGLint attribute, EGLAttribKHR *value); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglQuerySurface64KHR (EGLDisplay dpy, EGLSurface surface, EGLint attribute, EGLAttribKHR *value); +#endif +#endif /* EGL_KHR_lock_surface3 */ + +#ifndef EGL_KHR_reusable_sync +#define EGL_KHR_reusable_sync 1 +typedef khronos_utime_nanoseconds_t EGLTimeKHR; +#ifdef KHRONOS_SUPPORT_INT64 +#define EGL_SYNC_STATUS_KHR 0x30F1 +#define EGL_SIGNALED_KHR 0x30F2 +#define EGL_UNSIGNALED_KHR 0x30F3 +#define EGL_TIMEOUT_EXPIRED_KHR 0x30F5 +#define EGL_CONDITION_SATISFIED_KHR 0x30F6 +#define EGL_SYNC_TYPE_KHR 0x30F7 +#define EGL_SYNC_REUSABLE_KHR 0x30FA +#define EGL_SYNC_FLUSH_COMMANDS_BIT_KHR 0x0001 +#define EGL_FOREVER_KHR 0xFFFFFFFFFFFFFFFFull +#define EGL_NO_SYNC_KHR ((EGLSyncKHR)0) +typedef EGLSyncKHR (EGLAPIENTRYP PFNEGLCREATESYNCKHRPROC) (EGLDisplay dpy, EGLenum type, const EGLint *attrib_list); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYSYNCKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync); +typedef EGLint (EGLAPIENTRYP PFNEGLCLIENTWAITSYNCKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLint flags, EGLTimeKHR timeout); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLSIGNALSYNCKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLenum mode); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCATTRIBKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLint attribute, EGLint *value); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLSyncKHR EGLAPIENTRY eglCreateSyncKHR (EGLDisplay dpy, EGLenum type, const EGLint *attrib_list); +EGLAPI EGLBoolean EGLAPIENTRY eglDestroySyncKHR (EGLDisplay dpy, EGLSyncKHR sync); +EGLAPI EGLint EGLAPIENTRY eglClientWaitSyncKHR (EGLDisplay dpy, EGLSyncKHR sync, EGLint flags, EGLTimeKHR timeout); +EGLAPI EGLBoolean EGLAPIENTRY eglSignalSyncKHR (EGLDisplay dpy, EGLSyncKHR sync, EGLenum mode); +EGLAPI EGLBoolean EGLAPIENTRY eglGetSyncAttribKHR (EGLDisplay dpy, EGLSyncKHR sync, EGLint attribute, EGLint *value); +#endif +#endif /* KHRONOS_SUPPORT_INT64 */ +#endif /* EGL_KHR_reusable_sync */ + +#ifndef EGL_KHR_stream +#define EGL_KHR_stream 1 +typedef void *EGLStreamKHR; +typedef khronos_uint64_t EGLuint64KHR; +#ifdef KHRONOS_SUPPORT_INT64 +#define EGL_NO_STREAM_KHR ((EGLStreamKHR)0) +#define EGL_CONSUMER_LATENCY_USEC_KHR 0x3210 +#define EGL_PRODUCER_FRAME_KHR 0x3212 +#define EGL_CONSUMER_FRAME_KHR 0x3213 +#define EGL_STREAM_STATE_KHR 0x3214 +#define EGL_STREAM_STATE_CREATED_KHR 0x3215 +#define 
EGL_STREAM_STATE_CONNECTING_KHR 0x3216 +#define EGL_STREAM_STATE_EMPTY_KHR 0x3217 +#define EGL_STREAM_STATE_NEW_FRAME_AVAILABLE_KHR 0x3218 +#define EGL_STREAM_STATE_OLD_FRAME_AVAILABLE_KHR 0x3219 +#define EGL_STREAM_STATE_DISCONNECTED_KHR 0x321A +#define EGL_BAD_STREAM_KHR 0x321B +#define EGL_BAD_STATE_KHR 0x321C +typedef EGLStreamKHR (EGLAPIENTRYP PFNEGLCREATESTREAMKHRPROC) (EGLDisplay dpy, const EGLint *attrib_list); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYSTREAMKHRPROC) (EGLDisplay dpy, EGLStreamKHR stream); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLSTREAMATTRIBKHRPROC) (EGLDisplay dpy, EGLStreamKHR stream, EGLenum attribute, EGLint value); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYSTREAMKHRPROC) (EGLDisplay dpy, EGLStreamKHR stream, EGLenum attribute, EGLint *value); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYSTREAMU64KHRPROC) (EGLDisplay dpy, EGLStreamKHR stream, EGLenum attribute, EGLuint64KHR *value); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLStreamKHR EGLAPIENTRY eglCreateStreamKHR (EGLDisplay dpy, const EGLint *attrib_list); +EGLAPI EGLBoolean EGLAPIENTRY eglDestroyStreamKHR (EGLDisplay dpy, EGLStreamKHR stream); +EGLAPI EGLBoolean EGLAPIENTRY eglStreamAttribKHR (EGLDisplay dpy, EGLStreamKHR stream, EGLenum attribute, EGLint value); +EGLAPI EGLBoolean EGLAPIENTRY eglQueryStreamKHR (EGLDisplay dpy, EGLStreamKHR stream, EGLenum attribute, EGLint *value); +EGLAPI EGLBoolean EGLAPIENTRY eglQueryStreamu64KHR (EGLDisplay dpy, EGLStreamKHR stream, EGLenum attribute, EGLuint64KHR *value); +#endif +#endif /* KHRONOS_SUPPORT_INT64 */ +#endif /* EGL_KHR_stream */ + +#ifndef EGL_KHR_stream_consumer_gltexture +#define EGL_KHR_stream_consumer_gltexture 1 +#ifdef EGL_KHR_stream +#define EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR 0x321E +typedef EGLBoolean (EGLAPIENTRYP PFNEGLSTREAMCONSUMERGLTEXTUREEXTERNALKHRPROC) (EGLDisplay dpy, EGLStreamKHR stream); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLSTREAMCONSUMERACQUIREKHRPROC) (EGLDisplay dpy, EGLStreamKHR stream); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLSTREAMCONSUMERRELEASEKHRPROC) (EGLDisplay dpy, EGLStreamKHR stream); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglStreamConsumerGLTextureExternalKHR (EGLDisplay dpy, EGLStreamKHR stream); +EGLAPI EGLBoolean EGLAPIENTRY eglStreamConsumerAcquireKHR (EGLDisplay dpy, EGLStreamKHR stream); +EGLAPI EGLBoolean EGLAPIENTRY eglStreamConsumerReleaseKHR (EGLDisplay dpy, EGLStreamKHR stream); +#endif +#endif /* EGL_KHR_stream */ +#endif /* EGL_KHR_stream_consumer_gltexture */ + +#ifndef EGL_KHR_stream_cross_process_fd +#define EGL_KHR_stream_cross_process_fd 1 +typedef int EGLNativeFileDescriptorKHR; +#ifdef EGL_KHR_stream +#define EGL_NO_FILE_DESCRIPTOR_KHR ((EGLNativeFileDescriptorKHR)(-1)) +typedef EGLNativeFileDescriptorKHR (EGLAPIENTRYP PFNEGLGETSTREAMFILEDESCRIPTORKHRPROC) (EGLDisplay dpy, EGLStreamKHR stream); +typedef EGLStreamKHR (EGLAPIENTRYP PFNEGLCREATESTREAMFROMFILEDESCRIPTORKHRPROC) (EGLDisplay dpy, EGLNativeFileDescriptorKHR file_descriptor); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLNativeFileDescriptorKHR EGLAPIENTRY eglGetStreamFileDescriptorKHR (EGLDisplay dpy, EGLStreamKHR stream); +EGLAPI EGLStreamKHR EGLAPIENTRY eglCreateStreamFromFileDescriptorKHR (EGLDisplay dpy, EGLNativeFileDescriptorKHR file_descriptor); +#endif +#endif /* EGL_KHR_stream */ +#endif /* EGL_KHR_stream_cross_process_fd */ + +#ifndef EGL_KHR_stream_fifo +#define EGL_KHR_stream_fifo 1 +#ifdef EGL_KHR_stream +#define EGL_STREAM_FIFO_LENGTH_KHR 0x31FC +#define EGL_STREAM_TIME_NOW_KHR 0x31FD +#define 
EGL_STREAM_TIME_CONSUMER_KHR 0x31FE +#define EGL_STREAM_TIME_PRODUCER_KHR 0x31FF +typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYSTREAMTIMEKHRPROC) (EGLDisplay dpy, EGLStreamKHR stream, EGLenum attribute, EGLTimeKHR *value); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglQueryStreamTimeKHR (EGLDisplay dpy, EGLStreamKHR stream, EGLenum attribute, EGLTimeKHR *value); +#endif +#endif /* EGL_KHR_stream */ +#endif /* EGL_KHR_stream_fifo */ + +#ifndef EGL_KHR_stream_producer_aldatalocator +#define EGL_KHR_stream_producer_aldatalocator 1 +#ifdef EGL_KHR_stream +#endif /* EGL_KHR_stream */ +#endif /* EGL_KHR_stream_producer_aldatalocator */ + +#ifndef EGL_KHR_stream_producer_eglsurface +#define EGL_KHR_stream_producer_eglsurface 1 +#ifdef EGL_KHR_stream +#define EGL_STREAM_BIT_KHR 0x0800 +typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATESTREAMPRODUCERSURFACEKHRPROC) (EGLDisplay dpy, EGLConfig config, EGLStreamKHR stream, const EGLint *attrib_list); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLSurface EGLAPIENTRY eglCreateStreamProducerSurfaceKHR (EGLDisplay dpy, EGLConfig config, EGLStreamKHR stream, const EGLint *attrib_list); +#endif +#endif /* EGL_KHR_stream */ +#endif /* EGL_KHR_stream_producer_eglsurface */ + +#ifndef EGL_KHR_surfaceless_context +#define EGL_KHR_surfaceless_context 1 +#endif /* EGL_KHR_surfaceless_context */ + +#ifndef EGL_KHR_vg_parent_image +#define EGL_KHR_vg_parent_image 1 +#define EGL_VG_PARENT_IMAGE_KHR 0x30BA +#endif /* EGL_KHR_vg_parent_image */ + +#ifndef EGL_KHR_wait_sync +#define EGL_KHR_wait_sync 1 +typedef EGLint (EGLAPIENTRYP PFNEGLWAITSYNCKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLint flags); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLint EGLAPIENTRY eglWaitSyncKHR (EGLDisplay dpy, EGLSyncKHR sync, EGLint flags); +#endif +#endif /* EGL_KHR_wait_sync */ + +#ifndef EGL_ANDROID_blob_cache +#define EGL_ANDROID_blob_cache 1 +typedef khronos_ssize_t EGLsizeiANDROID; +typedef void (*EGLSetBlobFuncANDROID) (const void *key, EGLsizeiANDROID keySize, const void *value, EGLsizeiANDROID valueSize); +typedef EGLsizeiANDROID (*EGLGetBlobFuncANDROID) (const void *key, EGLsizeiANDROID keySize, void *value, EGLsizeiANDROID valueSize); +typedef void (EGLAPIENTRYP PFNEGLSETBLOBCACHEFUNCSANDROIDPROC) (EGLDisplay dpy, EGLSetBlobFuncANDROID set, EGLGetBlobFuncANDROID get); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI void EGLAPIENTRY eglSetBlobCacheFuncsANDROID (EGLDisplay dpy, EGLSetBlobFuncANDROID set, EGLGetBlobFuncANDROID get); +#endif +#endif /* EGL_ANDROID_blob_cache */ + +#ifndef EGL_ANDROID_framebuffer_target +#define EGL_ANDROID_framebuffer_target 1 +#define EGL_FRAMEBUFFER_TARGET_ANDROID 0x3147 +#endif /* EGL_ANDROID_framebuffer_target */ + +#ifndef EGL_ANDROID_image_native_buffer +#define EGL_ANDROID_image_native_buffer 1 +#define EGL_NATIVE_BUFFER_ANDROID 0x3140 +#endif /* EGL_ANDROID_image_native_buffer */ + +#ifndef EGL_ANDROID_native_fence_sync +#define EGL_ANDROID_native_fence_sync 1 +#define EGL_SYNC_NATIVE_FENCE_ANDROID 0x3144 +#define EGL_SYNC_NATIVE_FENCE_FD_ANDROID 0x3145 +#define EGL_SYNC_NATIVE_FENCE_SIGNALED_ANDROID 0x3146 +#define EGL_NO_NATIVE_FENCE_FD_ANDROID -1 +typedef EGLint (EGLAPIENTRYP PFNEGLDUPNATIVEFENCEFDANDROIDPROC) (EGLDisplay dpy, EGLSyncKHR sync); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLint EGLAPIENTRY eglDupNativeFenceFDANDROID (EGLDisplay dpy, EGLSyncKHR sync); +#endif +#endif /* EGL_ANDROID_native_fence_sync */ + +#ifndef EGL_ANDROID_recordable +#define EGL_ANDROID_recordable 1 +#define EGL_RECORDABLE_ANDROID 0x3142 +#endif /* 
EGL_ANDROID_recordable */ + +#ifndef EGL_ANGLE_d3d_share_handle_client_buffer +#define EGL_ANGLE_d3d_share_handle_client_buffer 1 +#define EGL_D3D_TEXTURE_2D_SHARE_HANDLE_ANGLE 0x3200 +#endif /* EGL_ANGLE_d3d_share_handle_client_buffer */ + +#ifndef EGL_ANGLE_query_surface_pointer +#define EGL_ANGLE_query_surface_pointer 1 +typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYSURFACEPOINTERANGLEPROC) (EGLDisplay dpy, EGLSurface surface, EGLint attribute, void **value); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglQuerySurfacePointerANGLE (EGLDisplay dpy, EGLSurface surface, EGLint attribute, void **value); +#endif +#endif /* EGL_ANGLE_query_surface_pointer */ + +#ifndef EGL_ANGLE_surface_d3d_texture_2d_share_handle +#define EGL_ANGLE_surface_d3d_texture_2d_share_handle 1 +#endif /* EGL_ANGLE_surface_d3d_texture_2d_share_handle */ + +#ifndef EGL_ARM_pixmap_multisample_discard +#define EGL_ARM_pixmap_multisample_discard 1 +#define EGL_DISCARD_SAMPLES_ARM 0x3286 +#endif /* EGL_ARM_pixmap_multisample_discard */ + +#ifndef EGL_EXT_buffer_age +#define EGL_EXT_buffer_age 1 +#define EGL_BUFFER_AGE_EXT 0x313D +#endif /* EGL_EXT_buffer_age */ + +#ifndef EGL_EXT_client_extensions +#define EGL_EXT_client_extensions 1 +#endif /* EGL_EXT_client_extensions */ + +#ifndef EGL_EXT_create_context_robustness +#define EGL_EXT_create_context_robustness 1 +#define EGL_CONTEXT_OPENGL_ROBUST_ACCESS_EXT 0x30BF +#define EGL_CONTEXT_OPENGL_RESET_NOTIFICATION_STRATEGY_EXT 0x3138 +#define EGL_NO_RESET_NOTIFICATION_EXT 0x31BE +#define EGL_LOSE_CONTEXT_ON_RESET_EXT 0x31BF +#endif /* EGL_EXT_create_context_robustness */ + +#ifndef EGL_EXT_image_dma_buf_import +#define EGL_EXT_image_dma_buf_import 1 +#define EGL_LINUX_DMA_BUF_EXT 0x3270 +#define EGL_LINUX_DRM_FOURCC_EXT 0x3271 +#define EGL_DMA_BUF_PLANE0_FD_EXT 0x3272 +#define EGL_DMA_BUF_PLANE0_OFFSET_EXT 0x3273 +#define EGL_DMA_BUF_PLANE0_PITCH_EXT 0x3274 +#define EGL_DMA_BUF_PLANE1_FD_EXT 0x3275 +#define EGL_DMA_BUF_PLANE1_OFFSET_EXT 0x3276 +#define EGL_DMA_BUF_PLANE1_PITCH_EXT 0x3277 +#define EGL_DMA_BUF_PLANE2_FD_EXT 0x3278 +#define EGL_DMA_BUF_PLANE2_OFFSET_EXT 0x3279 +#define EGL_DMA_BUF_PLANE2_PITCH_EXT 0x327A +#define EGL_YUV_COLOR_SPACE_HINT_EXT 0x327B +#define EGL_SAMPLE_RANGE_HINT_EXT 0x327C +#define EGL_YUV_CHROMA_HORIZONTAL_SITING_HINT_EXT 0x327D +#define EGL_YUV_CHROMA_VERTICAL_SITING_HINT_EXT 0x327E +#define EGL_ITU_REC601_EXT 0x327F +#define EGL_ITU_REC709_EXT 0x3280 +#define EGL_ITU_REC2020_EXT 0x3281 +#define EGL_YUV_FULL_RANGE_EXT 0x3282 +#define EGL_YUV_NARROW_RANGE_EXT 0x3283 +#define EGL_YUV_CHROMA_SITING_0_EXT 0x3284 +#define EGL_YUV_CHROMA_SITING_0_5_EXT 0x3285 +#endif /* EGL_EXT_image_dma_buf_import */ + +#ifndef EGL_EXT_multiview_window +#define EGL_EXT_multiview_window 1 +#define EGL_MULTIVIEW_VIEW_COUNT_EXT 0x3134 +#endif /* EGL_EXT_multiview_window */ + +#ifndef EGL_EXT_platform_base +#define EGL_EXT_platform_base 1 +typedef EGLDisplay (EGLAPIENTRYP PFNEGLGETPLATFORMDISPLAYEXTPROC) (EGLenum platform, void *native_display, const EGLint *attrib_list); +typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATEPLATFORMWINDOWSURFACEEXTPROC) (EGLDisplay dpy, EGLConfig config, void *native_window, const EGLint *attrib_list); +typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATEPLATFORMPIXMAPSURFACEEXTPROC) (EGLDisplay dpy, EGLConfig config, void *native_pixmap, const EGLint *attrib_list); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLDisplay EGLAPIENTRY eglGetPlatformDisplayEXT (EGLenum platform, void *native_display, const EGLint *attrib_list); +EGLAPI EGLSurface 
EGLAPIENTRY eglCreatePlatformWindowSurfaceEXT (EGLDisplay dpy, EGLConfig config, void *native_window, const EGLint *attrib_list); +EGLAPI EGLSurface EGLAPIENTRY eglCreatePlatformPixmapSurfaceEXT (EGLDisplay dpy, EGLConfig config, void *native_pixmap, const EGLint *attrib_list); +#endif +#endif /* EGL_EXT_platform_base */ + +#ifndef EGL_EXT_platform_wayland +#define EGL_EXT_platform_wayland 1 +#define EGL_PLATFORM_WAYLAND_EXT 0x31D8 +#endif /* EGL_EXT_platform_wayland */ + +#ifndef EGL_EXT_platform_x11 +#define EGL_EXT_platform_x11 1 +#define EGL_PLATFORM_X11_EXT 0x31D5 +#define EGL_PLATFORM_X11_SCREEN_EXT 0x31D6 +#endif /* EGL_EXT_platform_x11 */ + +#ifndef EGL_EXT_swap_buffers_with_damage +#define EGL_EXT_swap_buffers_with_damage 1 +typedef EGLBoolean (EGLAPIENTRYP PFNEGLSWAPBUFFERSWITHDAMAGEEXTPROC) (EGLDisplay dpy, EGLSurface surface, EGLint *rects, EGLint n_rects); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffersWithDamageEXT (EGLDisplay dpy, EGLSurface surface, EGLint *rects, EGLint n_rects); +#endif +#endif /* EGL_EXT_swap_buffers_with_damage */ + +#ifndef EGL_HI_clientpixmap +#define EGL_HI_clientpixmap 1 +struct EGLClientPixmapHI { + void *pData; + EGLint iWidth; + EGLint iHeight; + EGLint iStride; +}; +#define EGL_CLIENT_PIXMAP_POINTER_HI 0x8F74 +typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATEPIXMAPSURFACEHIPROC) (EGLDisplay dpy, EGLConfig config, struct EGLClientPixmapHI *pixmap); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLSurface EGLAPIENTRY eglCreatePixmapSurfaceHI (EGLDisplay dpy, EGLConfig config, struct EGLClientPixmapHI *pixmap); +#endif +#endif /* EGL_HI_clientpixmap */ + +#ifndef EGL_HI_colorformats +#define EGL_HI_colorformats 1 +#define EGL_COLOR_FORMAT_HI 0x8F70 +#define EGL_COLOR_RGB_HI 0x8F71 +#define EGL_COLOR_RGBA_HI 0x8F72 +#define EGL_COLOR_ARGB_HI 0x8F73 +#endif /* EGL_HI_colorformats */ + +#ifndef EGL_IMG_context_priority +#define EGL_IMG_context_priority 1 +#define EGL_CONTEXT_PRIORITY_LEVEL_IMG 0x3100 +#define EGL_CONTEXT_PRIORITY_HIGH_IMG 0x3101 +#define EGL_CONTEXT_PRIORITY_MEDIUM_IMG 0x3102 +#define EGL_CONTEXT_PRIORITY_LOW_IMG 0x3103 +#endif /* EGL_IMG_context_priority */ + +#ifndef EGL_MESA_drm_image +#define EGL_MESA_drm_image 1 +#define EGL_DRM_BUFFER_FORMAT_MESA 0x31D0 +#define EGL_DRM_BUFFER_USE_MESA 0x31D1 +#define EGL_DRM_BUFFER_FORMAT_ARGB32_MESA 0x31D2 +#define EGL_DRM_BUFFER_MESA 0x31D3 +#define EGL_DRM_BUFFER_STRIDE_MESA 0x31D4 +#define EGL_DRM_BUFFER_USE_SCANOUT_MESA 0x00000001 +#define EGL_DRM_BUFFER_USE_SHARE_MESA 0x00000002 +typedef EGLImageKHR (EGLAPIENTRYP PFNEGLCREATEDRMIMAGEMESAPROC) (EGLDisplay dpy, const EGLint *attrib_list); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLEXPORTDRMIMAGEMESAPROC) (EGLDisplay dpy, EGLImageKHR image, EGLint *name, EGLint *handle, EGLint *stride); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLImageKHR EGLAPIENTRY eglCreateDRMImageMESA (EGLDisplay dpy, const EGLint *attrib_list); +EGLAPI EGLBoolean EGLAPIENTRY eglExportDRMImageMESA (EGLDisplay dpy, EGLImageKHR image, EGLint *name, EGLint *handle, EGLint *stride); +#endif +#endif /* EGL_MESA_drm_image */ + +#ifndef EGL_MESA_platform_gbm +#define EGL_MESA_platform_gbm 1 +#define EGL_PLATFORM_GBM_MESA 0x31D7 +#endif /* EGL_MESA_platform_gbm */ + +#ifndef EGL_NV_3dvision_surface +#define EGL_NV_3dvision_surface 1 +#define EGL_AUTO_STEREO_NV 0x3136 +#endif /* EGL_NV_3dvision_surface */ + +#ifndef EGL_NV_coverage_sample +#define EGL_NV_coverage_sample 1 +#define EGL_COVERAGE_BUFFERS_NV 0x30E0 +#define EGL_COVERAGE_SAMPLES_NV 0x30E1 +#endif /* 
EGL_NV_coverage_sample */ + +#ifndef EGL_NV_coverage_sample_resolve +#define EGL_NV_coverage_sample_resolve 1 +#define EGL_COVERAGE_SAMPLE_RESOLVE_NV 0x3131 +#define EGL_COVERAGE_SAMPLE_RESOLVE_DEFAULT_NV 0x3132 +#define EGL_COVERAGE_SAMPLE_RESOLVE_NONE_NV 0x3133 +#endif /* EGL_NV_coverage_sample_resolve */ + +#ifndef EGL_NV_depth_nonlinear +#define EGL_NV_depth_nonlinear 1 +#define EGL_DEPTH_ENCODING_NV 0x30E2 +#define EGL_DEPTH_ENCODING_NONE_NV 0 +#define EGL_DEPTH_ENCODING_NONLINEAR_NV 0x30E3 +#endif /* EGL_NV_depth_nonlinear */ + +#ifndef EGL_NV_native_query +#define EGL_NV_native_query 1 +typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYNATIVEDISPLAYNVPROC) (EGLDisplay dpy, EGLNativeDisplayType *display_id); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYNATIVEWINDOWNVPROC) (EGLDisplay dpy, EGLSurface surf, EGLNativeWindowType *window); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYNATIVEPIXMAPNVPROC) (EGLDisplay dpy, EGLSurface surf, EGLNativePixmapType *pixmap); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglQueryNativeDisplayNV (EGLDisplay dpy, EGLNativeDisplayType *display_id); +EGLAPI EGLBoolean EGLAPIENTRY eglQueryNativeWindowNV (EGLDisplay dpy, EGLSurface surf, EGLNativeWindowType *window); +EGLAPI EGLBoolean EGLAPIENTRY eglQueryNativePixmapNV (EGLDisplay dpy, EGLSurface surf, EGLNativePixmapType *pixmap); +#endif +#endif /* EGL_NV_native_query */ + +#ifndef EGL_NV_post_convert_rounding +#define EGL_NV_post_convert_rounding 1 +#endif /* EGL_NV_post_convert_rounding */ + +#ifndef EGL_NV_post_sub_buffer +#define EGL_NV_post_sub_buffer 1 +#define EGL_POST_SUB_BUFFER_SUPPORTED_NV 0x30BE +typedef EGLBoolean (EGLAPIENTRYP PFNEGLPOSTSUBBUFFERNVPROC) (EGLDisplay dpy, EGLSurface surface, EGLint x, EGLint y, EGLint width, EGLint height); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglPostSubBufferNV (EGLDisplay dpy, EGLSurface surface, EGLint x, EGLint y, EGLint width, EGLint height); +#endif +#endif /* EGL_NV_post_sub_buffer */ + +#ifndef EGL_NV_stream_sync +#define EGL_NV_stream_sync 1 +#define EGL_SYNC_NEW_FRAME_NV 0x321F +typedef EGLSyncKHR (EGLAPIENTRYP PFNEGLCREATESTREAMSYNCNVPROC) (EGLDisplay dpy, EGLStreamKHR stream, EGLenum type, const EGLint *attrib_list); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLSyncKHR EGLAPIENTRY eglCreateStreamSyncNV (EGLDisplay dpy, EGLStreamKHR stream, EGLenum type, const EGLint *attrib_list); +#endif +#endif /* EGL_NV_stream_sync */ + +#ifndef EGL_NV_sync +#define EGL_NV_sync 1 +typedef void *EGLSyncNV; +typedef khronos_utime_nanoseconds_t EGLTimeNV; +#ifdef KHRONOS_SUPPORT_INT64 +#define EGL_SYNC_PRIOR_COMMANDS_COMPLETE_NV 0x30E6 +#define EGL_SYNC_STATUS_NV 0x30E7 +#define EGL_SIGNALED_NV 0x30E8 +#define EGL_UNSIGNALED_NV 0x30E9 +#define EGL_SYNC_FLUSH_COMMANDS_BIT_NV 0x0001 +#define EGL_FOREVER_NV 0xFFFFFFFFFFFFFFFFull +#define EGL_ALREADY_SIGNALED_NV 0x30EA +#define EGL_TIMEOUT_EXPIRED_NV 0x30EB +#define EGL_CONDITION_SATISFIED_NV 0x30EC +#define EGL_SYNC_TYPE_NV 0x30ED +#define EGL_SYNC_CONDITION_NV 0x30EE +#define EGL_SYNC_FENCE_NV 0x30EF +#define EGL_NO_SYNC_NV ((EGLSyncNV)0) +typedef EGLSyncNV (EGLAPIENTRYP PFNEGLCREATEFENCESYNCNVPROC) (EGLDisplay dpy, EGLenum condition, const EGLint *attrib_list); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYSYNCNVPROC) (EGLSyncNV sync); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLFENCENVPROC) (EGLSyncNV sync); +typedef EGLint (EGLAPIENTRYP PFNEGLCLIENTWAITSYNCNVPROC) (EGLSyncNV sync, EGLint flags, EGLTimeNV timeout); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLSIGNALSYNCNVPROC) (EGLSyncNV 
sync, EGLenum mode); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCATTRIBNVPROC) (EGLSyncNV sync, EGLint attribute, EGLint *value); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLSyncNV EGLAPIENTRY eglCreateFenceSyncNV (EGLDisplay dpy, EGLenum condition, const EGLint *attrib_list); +EGLAPI EGLBoolean EGLAPIENTRY eglDestroySyncNV (EGLSyncNV sync); +EGLAPI EGLBoolean EGLAPIENTRY eglFenceNV (EGLSyncNV sync); +EGLAPI EGLint EGLAPIENTRY eglClientWaitSyncNV (EGLSyncNV sync, EGLint flags, EGLTimeNV timeout); +EGLAPI EGLBoolean EGLAPIENTRY eglSignalSyncNV (EGLSyncNV sync, EGLenum mode); +EGLAPI EGLBoolean EGLAPIENTRY eglGetSyncAttribNV (EGLSyncNV sync, EGLint attribute, EGLint *value); +#endif +#endif /* KHRONOS_SUPPORT_INT64 */ +#endif /* EGL_NV_sync */ + +#ifndef EGL_NV_system_time +#define EGL_NV_system_time 1 +typedef khronos_utime_nanoseconds_t EGLuint64NV; +#ifdef KHRONOS_SUPPORT_INT64 +typedef EGLuint64NV (EGLAPIENTRYP PFNEGLGETSYSTEMTIMEFREQUENCYNVPROC) (void); +typedef EGLuint64NV (EGLAPIENTRYP PFNEGLGETSYSTEMTIMENVPROC) (void); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLuint64NV EGLAPIENTRY eglGetSystemTimeFrequencyNV (void); +EGLAPI EGLuint64NV EGLAPIENTRY eglGetSystemTimeNV (void); +#endif +#endif /* KHRONOS_SUPPORT_INT64 */ +#endif /* EGL_NV_system_time */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/amdocl/EGL/eglplatform.h b/amdocl/EGL/eglplatform.h new file mode 100644 index 0000000000..3ab8844f09 --- /dev/null +++ b/amdocl/EGL/eglplatform.h @@ -0,0 +1,125 @@ +#ifndef __eglplatform_h_ +#define __eglplatform_h_ + +/* +** Copyright (c) 2007-2013 The Khronos Group Inc. +** +** Permission is hereby granted, free of charge, to any person obtaining a +** copy of this software and/or associated documentation files (the +** "Materials"), to deal in the Materials without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Materials, and to +** permit persons to whom the Materials are furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be included +** in all copies or substantial portions of the Materials. +** +** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. +*/ + +/* Platform-specific types and definitions for egl.h + * $Revision: 23432 $ on $Date: 2013-10-09 00:57:24 -0700 (Wed, 09 Oct 2013) $ + * + * Adopters may modify khrplatform.h and this file to suit their platform. + * You are encouraged to submit all modifications to the Khronos group so that + * they can be included in future versions of this file. Please submit changes + * by sending them to the public Khronos Bugzilla (http://khronos.org/bugzilla) + * by filing a bug against product "EGL" component "Registry". + */ + +#include <KHR/khrplatform.h> + +/* Macros used in EGL function prototype declarations.
+ * + * EGL functions should be prototyped as: + * + * EGLAPI return-type EGLAPIENTRY eglFunction(arguments); + * typedef return-type (EGLAPIENTRYP PFNEGLFUNCTIONPROC) (arguments); + * + * KHRONOS_APICALL and KHRONOS_APIENTRY are defined in KHR/khrplatform.h + */ + +#ifndef EGLAPI +#define EGLAPI KHRONOS_APICALL +#endif + +#ifndef EGLAPIENTRY +#define EGLAPIENTRY KHRONOS_APIENTRY +#endif +#define EGLAPIENTRYP EGLAPIENTRY* + +/* The types NativeDisplayType, NativeWindowType, and NativePixmapType + * are aliases of window-system-dependent types, such as X Display * or + * Windows Device Context. They must be defined in platform-specific + * code below. The EGL-prefixed versions of Native*Type are the same + * types, renamed in EGL 1.3 so all types in the API start with "EGL". + * + * Khronos STRONGLY RECOMMENDS that you use the default definitions + * provided below, since these changes affect both binary and source + * portability of applications using EGL running on different EGL + * implementations. + */ + +#if defined(_WIN32) || defined(__VC32__) && !defined(__CYGWIN__) && !defined(__SCITECH_SNAP__) /* Win32 and WinCE */ +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN 1 +#endif +#include <windows.h> + +typedef HDC EGLNativeDisplayType; +typedef HBITMAP EGLNativePixmapType; +typedef HWND EGLNativeWindowType; + +#elif defined(__WINSCW__) || defined(__SYMBIAN32__) /* Symbian */ + +typedef int EGLNativeDisplayType; +typedef void *EGLNativeWindowType; +typedef void *EGLNativePixmapType; + +#elif defined(__ANDROID__) || defined(ANDROID) + +#include <android/native_window.h> + +struct egl_native_pixmap_t; + +typedef struct ANativeWindow* EGLNativeWindowType; +typedef struct egl_native_pixmap_t* EGLNativePixmapType; +typedef void* EGLNativeDisplayType; + +#elif defined(__unix__) + +/* X11 (tentative) */ +#include <X11/Xlib.h> +#include <X11/Xutil.h> + +typedef Display *EGLNativeDisplayType; +typedef Pixmap EGLNativePixmapType; +typedef Window EGLNativeWindowType; + +#else +#error "Platform not recognized" +#endif + +/* EGL 1.2 types, renamed for consistency in EGL 1.3 */ +typedef EGLNativeDisplayType NativeDisplayType; +typedef EGLNativePixmapType NativePixmapType; +typedef EGLNativeWindowType NativeWindowType; + + +/* Define EGLint. This must be a signed integral type large enough to contain + * all legal attribute names and values passed into and out of EGL, whether + * their type is boolean, bitmask, enumerant (symbolic constant), integer, + * handle, or other. While in general a 32-bit integer will suffice, if + * handles are 64 bit types, then EGLint should be defined as a signed 64-bit + * integer type. + */ +typedef khronos_int32_t EGLint; + +#endif /* __eglplatform_h */
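A minimal usage sketch of the attribute-list convention these headers establish (every call and token used below is declared in the egl.h added by this patch; the 8-bit channel sizes and the ES2 renderable bit are illustrative choices, not requirements):

    EGLDisplay dpy = eglGetDisplay(EGL_DEFAULT_DISPLAY);
    if (dpy == EGL_NO_DISPLAY || eglInitialize(dpy, NULL, NULL) == EGL_FALSE) {
        /* eglGetError() reports e.g. EGL_NOT_INITIALIZED */
    }
    const EGLint attribs[] = {
        EGL_RED_SIZE,        8,
        EGL_GREEN_SIZE,      8,
        EGL_BLUE_SIZE,       8,
        EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
        EGL_NONE                       /* attrib list terminator */
    };
    EGLConfig config;
    EGLint num_config;
    if (eglChooseConfig(dpy, attribs, &config, 1, &num_config) == EGL_FALSE) {
        /* eglGetError() reports e.g. EGL_BAD_ATTRIBUTE */
    }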
diff --git a/amdocl/KHR/khrplatform.h b/amdocl/KHR/khrplatform.h new file mode 100644 index 0000000000..c9e6f17d34 --- /dev/null +++ b/amdocl/KHR/khrplatform.h @@ -0,0 +1,282 @@ +#ifndef __khrplatform_h_ +#define __khrplatform_h_ + +/* +** Copyright (c) 2008-2009 The Khronos Group Inc. +** +** Permission is hereby granted, free of charge, to any person obtaining a +** copy of this software and/or associated documentation files (the +** "Materials"), to deal in the Materials without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Materials, and to +** permit persons to whom the Materials are furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be included +** in all copies or substantial portions of the Materials. +** +** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. +*/ + +/* Khronos platform-specific types and definitions. + * + * $Revision: 23298 $ on $Date: 2013-09-30 17:07:13 -0700 (Mon, 30 Sep 2013) $ + * + * Adopters may modify this file to suit their platform. Adopters are + * encouraged to submit platform specific modifications to the Khronos + * group so that they can be included in future versions of this file. + * Please submit changes by sending them to the public Khronos Bugzilla + * (http://khronos.org/bugzilla) by filing a bug against product + * "Khronos (general)" component "Registry". + * + * A predefined template which fills in some of the bug fields can be + * reached using http://tinyurl.com/khrplatform-h-bugreport, but you + * must create a Bugzilla login first. + * + * + * See the Implementer's Guidelines for information about where this file + * should be located on your system and for more details of its use: + * http://www.khronos.org/registry/implementers_guide.pdf + * + * This file should be included as + * #include <KHR/khrplatform.h> + * by Khronos client API header files that use its types and defines. + * + * The types in khrplatform.h should only be used to define API-specific types. + * + * Types defined in khrplatform.h: + * khronos_int8_t signed 8 bit + * khronos_uint8_t unsigned 8 bit + * khronos_int16_t signed 16 bit + * khronos_uint16_t unsigned 16 bit + * khronos_int32_t signed 32 bit + * khronos_uint32_t unsigned 32 bit + * khronos_int64_t signed 64 bit + * khronos_uint64_t unsigned 64 bit + * khronos_intptr_t signed same number of bits as a pointer + * khronos_uintptr_t unsigned same number of bits as a pointer + * khronos_ssize_t signed size + * khronos_usize_t unsigned size + * khronos_float_t signed 32 bit floating point + * khronos_time_ns_t unsigned 64 bit time in nanoseconds + * khronos_utime_nanoseconds_t unsigned time interval or absolute time in + * nanoseconds + * khronos_stime_nanoseconds_t signed time interval in nanoseconds + * khronos_boolean_enum_t enumerated boolean type. This should + * only be used as a base type when a client API's boolean type is + * an enum. Client APIs which use an integer or other type for + * booleans cannot use this as the base type for their boolean. + * + * Tokens defined in khrplatform.h: + * + * KHRONOS_FALSE, KHRONOS_TRUE Enumerated boolean false/true values. + * + * KHRONOS_SUPPORT_INT64 is 1 if 64 bit integers are supported; otherwise 0.
+ * KHRONOS_SUPPORT_FLOAT is 1 if floats are supported; otherwise 0. + * + * Calling convention macros defined in this file: + * KHRONOS_APICALL + * KHRONOS_APIENTRY + * KHRONOS_APIATTRIBUTES + * + * These may be used in function prototypes as: + * + * KHRONOS_APICALL void KHRONOS_APIENTRY funcname( + * int arg1, + * int arg2) KHRONOS_APIATTRIBUTES; + */ + +/*------------------------------------------------------------------------- + * Definition of KHRONOS_APICALL + *------------------------------------------------------------------------- + * This precedes the return type of the function in the function prototype. + */ +#if defined(_WIN32) && !defined(__SCITECH_SNAP__) +# define KHRONOS_APICALL __declspec(dllimport) +#elif defined (__SYMBIAN32__) +# define KHRONOS_APICALL IMPORT_C +#else +# define KHRONOS_APICALL +#endif + +/*------------------------------------------------------------------------- + * Definition of KHRONOS_APIENTRY + *------------------------------------------------------------------------- + * This follows the return type of the function and precedes the function + * name in the function prototype. + */ +#if defined(_WIN32) && !defined(_WIN32_WCE) && !defined(__SCITECH_SNAP__) + /* Win32 but not WinCE */ +# define KHRONOS_APIENTRY __stdcall +#else +# define KHRONOS_APIENTRY +#endif + +/*------------------------------------------------------------------------- + * Definition of KHRONOS_APIATTRIBUTES + *------------------------------------------------------------------------- + * This follows the closing parenthesis of the function prototype arguments. + */ +#if defined (__ARMCC_2__) +#define KHRONOS_APIATTRIBUTES __softfp +#else +#define KHRONOS_APIATTRIBUTES +#endif + +/*------------------------------------------------------------------------- + * basic type definitions + *-----------------------------------------------------------------------*/ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(__GNUC__) || defined(__SCO__) || defined(__USLC__) + + +/* + * Using <stdint.h> + */ +#include <stdint.h> +typedef int32_t khronos_int32_t; +typedef uint32_t khronos_uint32_t; +typedef int64_t khronos_int64_t; +typedef uint64_t khronos_uint64_t; +#define KHRONOS_SUPPORT_INT64 1 +#define KHRONOS_SUPPORT_FLOAT 1 + +#elif defined(__VMS ) || defined(__sgi) + +/* + * Using <inttypes.h> + */ +#include <inttypes.h> +typedef int32_t khronos_int32_t; +typedef uint32_t khronos_uint32_t; +typedef int64_t khronos_int64_t; +typedef uint64_t khronos_uint64_t; +#define KHRONOS_SUPPORT_INT64 1 +#define KHRONOS_SUPPORT_FLOAT 1 + +#elif defined(_WIN32) && !defined(__SCITECH_SNAP__) + +/* + * Win32 + */ +typedef __int32 khronos_int32_t; +typedef unsigned __int32 khronos_uint32_t; +typedef __int64 khronos_int64_t; +typedef unsigned __int64 khronos_uint64_t; +#define KHRONOS_SUPPORT_INT64 1 +#define KHRONOS_SUPPORT_FLOAT 1 + +#elif defined(__sun__) || defined(__digital__) + +/* + * Sun or Digital + */ +typedef int khronos_int32_t; +typedef unsigned int khronos_uint32_t; +#if defined(__arch64__) || defined(_LP64) +typedef long int khronos_int64_t; +typedef unsigned long int khronos_uint64_t; +#else +typedef long long int khronos_int64_t; +typedef unsigned long long int khronos_uint64_t; +#endif /* __arch64__ */ +#define KHRONOS_SUPPORT_INT64 1 +#define KHRONOS_SUPPORT_FLOAT 1 + +#elif 0 + +/* + * Hypothetical platform with no float or int64 support + */ +typedef int khronos_int32_t; +typedef unsigned int khronos_uint32_t; +#define KHRONOS_SUPPORT_INT64 0 +#define KHRONOS_SUPPORT_FLOAT 0 + +#else + +/* + * Generic fallback
+ */ +#include <stdint.h> +typedef int32_t khronos_int32_t; +typedef uint32_t khronos_uint32_t; +typedef int64_t khronos_int64_t; +typedef uint64_t khronos_uint64_t; +#define KHRONOS_SUPPORT_INT64 1 +#define KHRONOS_SUPPORT_FLOAT 1 + +#endif + + +/* + * Types that are (so far) the same on all platforms + */ +typedef signed char khronos_int8_t; +typedef unsigned char khronos_uint8_t; +typedef signed short int khronos_int16_t; +typedef unsigned short int khronos_uint16_t; + +/* + * Types that differ between LLP64 and LP64 architectures - in LLP64, + * pointers are 64 bits, but 'long' is still 32 bits. Win64 appears + * to be the only LLP64 architecture in current use. + */ +#ifdef _WIN64 +typedef signed long long int khronos_intptr_t; +typedef unsigned long long int khronos_uintptr_t; +typedef signed long long int khronos_ssize_t; +typedef unsigned long long int khronos_usize_t; +#else +typedef signed long int khronos_intptr_t; +typedef unsigned long int khronos_uintptr_t; +typedef signed long int khronos_ssize_t; +typedef unsigned long int khronos_usize_t; +#endif + +#if KHRONOS_SUPPORT_FLOAT +/* + * Float type + */ +typedef float khronos_float_t; +#endif + +#if KHRONOS_SUPPORT_INT64 +/* Time types + * + * These types can be used to represent a time interval in nanoseconds or + * an absolute Unadjusted System Time. Unadjusted System Time is the number + * of nanoseconds since some arbitrary system event (e.g. since the last + * time the system booted). The Unadjusted System Time is an unsigned + * 64 bit value that wraps back to 0 every 584 years. Time intervals + * may be either signed or unsigned. + */ +typedef khronos_uint64_t khronos_utime_nanoseconds_t; +typedef khronos_int64_t khronos_stime_nanoseconds_t; +#endif + +/* + * Dummy value used to pad enum types to 32 bits. + */ +#ifndef KHRONOS_MAX_ENUM +#define KHRONOS_MAX_ENUM 0x7FFFFFFF +#endif + +/* + * Enumerated boolean type + * + * Values other than zero should be considered to be true. Therefore + * comparisons should not be made against KHRONOS_TRUE. + */ +typedef enum { + KHRONOS_FALSE = 0, + KHRONOS_TRUE = 1, + KHRONOS_BOOLEAN_ENUM_FORCE_SIZE = KHRONOS_MAX_ENUM +} khronos_boolean_enum_t; + +#endif /* __khrplatform_h_ */
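A minimal sketch of the comparison rule the khronos_boolean_enum_t comment above spells out (the helper itself is hypothetical):

    static void check(khronos_boolean_enum_t b) {
        if (b != KHRONOS_FALSE) { /* correct: any non-zero value counts as true */ }
        /* if (b == KHRONOS_TRUE) ... would wrongly reject other non-zero true values */
    }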
diff --git a/amdocl/cl_common.hpp b/amdocl/cl_common.hpp new file mode 100644 index 0000000000..a88a06f498 --- /dev/null +++ b/amdocl/cl_common.hpp @@ -0,0 +1,301 @@ +/* Copyright (c) 2008-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef CL_COMMON_HPP_ +#define CL_COMMON_HPP_ + +#include "top.hpp" +#include "platform/runtime.hpp" +#include "platform/command.hpp" +#include "platform/memory.hpp" +#include "thread/thread.hpp" +#include "platform/commandqueue.hpp" + +#include <cstring> +#include <utility> + +//! \cond ignore +namespace amd { + +template <typename T> +class NotNullWrapper +{ +private: + T* const ptrOrNull_; + +protected: + explicit NotNullWrapper(T* ptrOrNull) + : ptrOrNull_(ptrOrNull) + { } + +public: + void operator = (T value) const + { + if (ptrOrNull_ != NULL) { + *ptrOrNull_ = value; + } + } +}; + +template <typename T> +class NotNullReference : protected NotNullWrapper<T> +{ +public: + explicit NotNullReference(T* ptrOrNull) + : NotNullWrapper<T>(ptrOrNull) + { } + + const NotNullWrapper<T>& operator * () const { return *this; } +}; + +} // namespace amd + +template <typename T> +inline amd::NotNullReference<T> +not_null(T* ptrOrNull) +{ + return amd::NotNullReference<T>(ptrOrNull); +} + +#define CL_CHECK_THREAD(thread) \ + (thread != NULL || ((thread = new amd::HostThread()) != NULL \ + && thread == amd::Thread::current())) + +#define RUNTIME_ENTRY_RET(ret, func, args) \ +CL_API_ENTRY ret CL_API_CALL \ +func args \ +{ \ + amd::Thread* thread = amd::Thread::current(); \ + if (!CL_CHECK_THREAD(thread)) { \ + *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY; \ + return (ret) 0; \ + } + +#define RUNTIME_ENTRY_RET_NOERRCODE(ret, func, args) \ +CL_API_ENTRY ret CL_API_CALL \ +func args \ +{ \ + amd::Thread* thread = amd::Thread::current(); \ + if (!CL_CHECK_THREAD(thread)) { \ + return (ret) 0; \ + } + +#define RUNTIME_ENTRY(ret, func, args) \ +CL_API_ENTRY ret CL_API_CALL \ +func args \ +{ \ + amd::Thread* thread = amd::Thread::current(); \ + if (!CL_CHECK_THREAD(thread)) { \ + return CL_OUT_OF_HOST_MEMORY; \ + } + +#define RUNTIME_ENTRY_VOID(ret, func, args) \ +CL_API_ENTRY ret CL_API_CALL \ +func args \ +{ \ + amd::Thread* thread = amd::Thread::current(); \ + if (!CL_CHECK_THREAD(thread)) { \ + return; \ + } + +#define RUNTIME_EXIT \ + /* FIXME_lmoriche: we should check thread->lastError here! */ \ +} + +//! Helper function to check "properties" parameter in various functions +int checkContextProperties( + const cl_context_properties *properties, + bool* offlineDevices); + +namespace amd { + +namespace detail { + +template <typename T> +struct ParamInfo +{ + static inline std::pair<const void*, size_t> get(const T& param) { + return std::pair<const void*, size_t>(&param, sizeof(T)); + } +}; + +template <> +struct ParamInfo<const char*> +{ + static inline std::pair<const void*, size_t> get(const char* param) { + return std::pair<const void*, size_t>(param, strlen(param) + 1); + } +}; + +template <size_t N> +struct ParamInfo<char[N]> +{ + static inline std::pair<const void*, size_t> get(const char* param) { + return std::pair<const void*, size_t>(param, strlen(param) + 1); + } +}; + +} // namespace detail + +template <typename T> +static inline cl_int +clGetInfo( + T& field, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) +{ + const void *valuePtr; + size_t valueSize; + + std::tie(valuePtr, valueSize) + = detail::ParamInfo<typename std::remove_const<T>::type>::get(field); + + *not_null(param_value_size_ret) = valueSize; + + cl_int ret = CL_SUCCESS; + if (param_value != NULL && param_value_size < valueSize) { + if (!std::is_pointer<T>() || !std::is_same<typename std::remove_cv< + typename std::remove_pointer<T>::type>::type, char>()) { + return CL_INVALID_VALUE; + } + // For char* and char[] params, we will at least fill up to + // param_value_size, then return an error. + valueSize = param_value_size; + static_cast<char*>(param_value)[--valueSize] = '\0'; + ret = CL_INVALID_VALUE; + } + + if (param_value != NULL) { + ::memcpy(param_value, valuePtr, valueSize); + if (param_value_size > valueSize) { + ::memset(static_cast<char*>(param_value) + valueSize, + '\0', param_value_size - valueSize); + } + } + + return ret; +}
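A sketch of how a query routine would sit on top of the clGetInfo helper above (the wrapper and its field are hypothetical; the ParamInfo specializations pick the right size and copy behavior per field type):

    // Hypothetical getter: returns a name string through the standard
    // (param_value_size, param_value, param_value_size_ret) triple.
    static cl_int getDeviceNameInfo(const char* name, size_t param_value_size,
                                    void* param_value, size_t* param_value_size_ret) {
      // For char* fields, a too-small buffer is filled and NUL-terminated up to
      // param_value_size and CL_INVALID_VALUE is returned, per CL convention.
      return amd::clGetInfo(name, param_value_size, param_value, param_value_size_ret);
    }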
+ +static inline cl_int +clSetEventWaitList( + Command::EventWaitList& eventWaitList, + const amd::HostQueue& hostQueue, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list) +{ + if ((num_events_in_wait_list == 0 && event_wait_list != NULL) + || (num_events_in_wait_list != 0 && event_wait_list == NULL)) { + return CL_INVALID_EVENT_WAIT_LIST; + } + + while (num_events_in_wait_list-- > 0) { + cl_event event = *event_wait_list++; + Event* amdEvent = as_amd(event); + if (!is_valid(event)) { + return CL_INVALID_EVENT_WAIT_LIST; + } + if (&hostQueue.context() != &amdEvent->context()) { + return CL_INVALID_CONTEXT; + } + if ((amdEvent->command().queue() != &hostQueue) && !amdEvent->notifyCmdQueue()) { + return CL_INVALID_EVENT_WAIT_LIST; + } + eventWaitList.push_back(amdEvent); + } + return CL_SUCCESS; +} + +//! Common function declarations for CL-external graphics API interop +cl_int clEnqueueAcquireExtObjectsAMD(cl_command_queue command_queue, + cl_uint num_objects, const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, const cl_event* event_wait_list, + cl_event* event, cl_command_type cmd_type); +cl_int clEnqueueReleaseExtObjectsAMD(cl_command_queue command_queue, + cl_uint num_objects, const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, const cl_event* event_wait_list, + cl_event* event, cl_command_type cmd_type); + +// This may need moving somewhere tidier... + +struct PlatformIDS { const struct KHRicdVendorDispatchRec* dispatch_; }; +class PlatformID { +public: + static PlatformIDS Platform; +}; +#define AMD_PLATFORM (reinterpret_cast<cl_platform_id>(&amd::PlatformID::Platform)) + +} // namespace amd + +extern "C" { + +extern CL_API_ENTRY cl_key_amd CL_API_CALL +clCreateKeyAMD( + cl_platform_id platform, + void (CL_CALLBACK * destructor)( void * ), + cl_int * errcode_ret); + +extern CL_API_ENTRY cl_int CL_API_CALL +clObjectGetValueForKeyAMD( + void * object, + cl_key_amd key, + void ** ret_val); + +extern CL_API_ENTRY cl_int CL_API_CALL +clObjectSetValueForKeyAMD( + void * object, + cl_key_amd key, + void * value); + +#if defined(CL_VERSION_1_1) +extern CL_API_ENTRY cl_int CL_API_CALL +clSetCommandQueueProperty( + cl_command_queue command_queue, + cl_command_queue_properties properties, + cl_bool enable, + cl_command_queue_properties *old_properties) CL_API_SUFFIX__VERSION_1_0; +#endif // CL_VERSION_1_1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clConvertImageAMD( + cl_context context, + cl_mem image, + const cl_image_format * image_format, + cl_int * errcode_ret); + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBufferFromImageAMD( + cl_context context, + cl_mem image, + cl_int * errcode_ret); + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithAssemblyAMD( + cl_context context, + cl_uint count, + const char ** strings, + const size_t * lengths, + cl_int * errcode_ret); + +} // extern "C" + +//! \endcond + +#endif /*CL_COMMON_HPP_*/
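A sketch of how the RUNTIME_ENTRY/RUNTIME_EXIT pair defined above is meant to bracket an API entry point (clExampleNoOpAMD is invented purely for illustration):

    // RUNTIME_ENTRY emits the signature, opens the function body, and returns
    // CL_OUT_OF_HOST_MEMORY when no amd::HostThread can be bound to the caller;
    // RUNTIME_EXIT supplies the matching closing brace.
    RUNTIME_ENTRY(cl_int, clExampleNoOpAMD, (cl_uint flags)) {
      (void)flags;  // a real entry point would validate arguments and do work here
      return CL_SUCCESS;
    }
    RUNTIME_EXIT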
diff --git a/amdocl/cl_debugger_amd.h b/amdocl/cl_debugger_amd.h new file mode 100644 index 0000000000..1e9fe29e3a --- /dev/null +++ b/amdocl/cl_debugger_amd.h @@ -0,0 +1,694 @@ +/* Copyright (c) 2014-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef __CL_DEBUGGER_AMD_H +#define __CL_DEBUGGER_AMD_H + +#ifdef __APPLE__ +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +/****************************************** +* Private AMD extension cl_dbg * +******************************************/ +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + +#define CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD -80 +#define CL_DEBUGGER_REGISTER_FAILURE_AMD -81 +#define CL_TRAP_HANDLER_NOT_DEFINED_AMD -82 +#define CL_EVENT_TIMEOUT_AMD -83 + + +typedef uintptr_t cl_dbg_event_amd; //! debug event + +/*! \brief Trap Handler Type + * + * The trap handler for each supported type. + */ +enum cl_dbg_trap_type_amd { + CL_DBG_DEBUG_TRAP = 0, //! HW debug + CL_DBG_MAX_TRAP +}; + +/*! \brief Wave actions used to control the wave execution on the hardware + * + * The wave action enumerations are used to specify the desired + * behavior when calling the wave control function. Overall, there are + * five types of operations that can be specified. + */ +enum cl_dbg_waves_action_amd { + CL_DBG_WAVES_DONT_USE_ZERO = 0, //! NOT USED + CL_DBG_WAVES_HALT = 1, //! halt wave + CL_DBG_WAVES_RESUME = 2, //! resume wave + CL_DBG_WAVES_KILL = 3, //! kill wave + CL_DBG_WAVES_DEBUG = 4, //! debug wave + CL_DBG_WAVES_TRAP = 5, //! trap + CL_DBG_WAVES_MAX +}; + +/*! \brief Host actions when encountering an exception in the kernel. + * + * The host action enumeration is used to specify the desired host + * response in the event that a device kernel exception is encountered. + */ +enum cl_dbg_host_action_amd { + CL_DBG_HOST_IGNORE = 1, //! ignore the kernel exception + CL_DBG_HOST_EXIT = 2, //! exit the host application on a kernel exception + CL_DBG_HOST_NOTIFY = 4 //! report the kernel exception +}; + +/*! \brief Mode of the wave action when calling the wave control function + * + * The wave mode enumerations are used to specify the desired + * broadcast level when calling the wave control function. + */ +enum cl_dbg_wave_mode_amd { + CL_DBG_WAVEMODE_SINGLE = 0, //! send command to single wave + CL_DBG_WAVEMODE_BROADCAST = 2, //! send command to wave with match VMID + CL_DBG_WAVEMODE_BROADCAST_CU = 3, //! send command to wave with match VMID with specific CU + CL_DBG_WAVEMODE_MAX +}; + +/*! \brief Enumeration of address watch mode + * + * This enumeration indicates the different modes of address watch. + */ +enum cl_dbg_address_watch_mode_amd { + CL_DBG_ADDR_WATCH_MODE_READ = 0, //! Read operations only + CL_DBG_ADDR_WATCH_MODE_NONREAD = 1, //! 
Write or Atomic operations only
+ CL_DBG_ADDR_WATCH_MODE_ATOMIC = 2, //! Atomic Operations only
+ CL_DBG_ADDR_WATCH_MODE_ALL = 3, //! Read, Write or Atomic operations
+ CL_DBG_ADDR_WATCH_MODE_MAX //! Number of address watch modes
+};
+
+/*! \brief Dispatch exception policy descriptor
+ *
+ * The dispatch exception policy descriptor is used to define the
+ * expected exception policy in the event an exception is encountered
+ * on the associated dispatch.
+ */
+typedef struct _cl_dbg_exception_policy_amd {
+ cl_uint exceptionMask; //! exception mask
+ cl_dbg_waves_action_amd waveAction; //! wave action
+ cl_dbg_host_action_amd hostAction; //! host action
+ cl_dbg_wave_mode_amd waveMode; //! wave mode
+} cl_dbg_exception_policy_amd;
+
+/*! \brief Kernel execution mode
+ *
+ * This structure is used to control the kernel execution mode. The
+ * following aspects are included in this structure:
+ * 1. Regular execution or debug mode (0: regular execution (default),
+ * 1: debug mode)
+ * 2. SQ debugger mode on/off
+ * 3. Disable L1 scalar cache (0: enable (default), 1: disable)
+ * 4. Disable L1 vector cache (0: enable (default), 1: disable)
+ * 5. Disable L2 cache (0: enable (default), 1: disable)
+ * 6. Num of CUs reserved for display (0 (default), 7: max)
+ * An illustrative usage sketch follows the struct declarations below.
+ */
+typedef struct _cl_dbg_kernel_exec_mode_amd {
+ union {
+ struct {
+ cl_uint monitorMode : 1;
+ cl_uint gpuSingleStepMode : 1;
+ cl_uint disableL1Scalar : 1;
+ cl_uint disableL1Vector : 1;
+ cl_uint disableL2Cache : 1;
+ cl_uint reservedCuNum : 3;
+ cl_uint reserved : 24;
+ };
+ cl_uint ui32All;
+ };
+} cl_dbg_kernel_exec_mode_amd;
+
+/*! \brief GPU cache mask
+ *
+ * This structure is used to specify the GPU cache to be flushed/invalidated
+ */
+typedef struct _cl_dbg_gpu_cache_mask_amd {
+ union {
+ struct {
+ cl_uint sqICache : 1; //! instruction cache
+ cl_uint sqKCache : 1; //! data cache
+ cl_uint tcL1 : 1; //! tcL1 cache
+ cl_uint tcL2 : 1; //! tcL2 cache
+ cl_uint reserved : 28;
+ };
+ cl_uint ui32All;
+ };
+} cl_dbg_gpu_cache_mask_amd;
+
+/*! \brief Dispatch Debug Info
+ *
+ * This structure is used to store the scratch and global memory descriptors
+ */
+typedef struct _cl_dispatch_debug_info_amd {
+ cl_uint scratchMemoryDescriptor[4]; //! Scratch memory descriptors
+ cl_uint globalMemoryDescriptor[4]; //! Global memory descriptors
+} cl_dispatch_debug_info_amd;
+
+/*! \brief AQL Packet Info
+ *
+ * This structure is used to store AQL packet information for kernel dispatch
+ */
+typedef struct _cl_aql_packet_info_amd {
+ cl_uint trapReservedVgprIndex; //! VGPR index reserved for trap
+ //! value is -1 when kernel was not compiled
+ //! in debug mode.
+ cl_uint scratchBufferWaveOffset; //! scratch buffer wave offset
+ //! value is -1 when kernel was not compiled
+ //! in debug mode or scratch buffer is not enabled
+ void* pointerToIsaBuffer; //! Pointer to buffer containing ISA
+ size_t sizeOfIsaBuffer; //! Size of the ISA buffer
+
+ cl_uint numberOfVgprs; //! Number of VGPRs used by the kernel
+ cl_uint numberOfSgprs; //! Number of SGPRs used by the kernel
+ size_t sizeOfStaticGroupMemory; //! Static local memory used by the kernel
+} cl_aql_packet_info_amd;
+
+/*! \brief Wave address
+ *
+ * This structure specifies the wave for the SQ control command
+ */
+typedef struct _cl_dbg_wave_addr_amd {
+ cl_uint shaderEngine : 2; //! Shader engine
+ cl_uint shaderArray : 1; //! Shader array
+ cl_uint computeUnit : 4; //! Compute unit
+ cl_uint simd : 2; //! SIMD id
+ cl_uint wave : 4; //! Wave id
+ cl_uint vmid : 4; //! VMID
+ cl_uint reserved : 15;
+
+} cl_dbg_wave_addr_amd;
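+
+/* A hypothetical usage sketch for the structures above (illustrative only,
+   not part of the patch): enable single-step debug execution, then
+   flush/invalidate the SQ instruction cache. 'device' is assumed to be a
+   valid cl_device_id; the entry points used here are declared further below.
+
+   cl_dbg_kernel_exec_mode_amd mode;
+   mode.ui32All = 0;            // clear all control bits
+   mode.monitorMode = 1;        // debug mode instead of regular execution
+   mode.gpuSingleStepMode = 1;  // SQ debugger single-step mode on
+   cl_int err = clHwDbgSetKernelExecutionModeAMD(device, &mode);
+
+   cl_dbg_gpu_cache_mask_amd mask;
+   mask.ui32All = 0;
+   mask.sqICache = 1;           // instruction cache only
+   if (err == CL_SUCCESS) err = clHwDbgFlushCacheAMD(device, mask);
+*/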
+
+/*! \brief Pre-dispatch call back function signature
+ *
+ * This is the signature of the call back function before the kernel
+ * dispatch. The call back function indicates the start of the
+ * kernel launch. It is used by the debugger.
+ */
+typedef void* (*cl_PreDispatchCallBackFunctionAMD)(cl_device_id device, void* ocl_event_handle,
+ const void* aql_packet, void* acl_binary,
+ void* user_args);
+
+/*! \brief Post-dispatch call back function signature
+ *
+ * This is the signature of the call back function after the kernel
+ * dispatch. The call back function indicates the completion of
+ * the kernel launch. It is used by the debugger.
+ */
+typedef void* (*cl_PostDispatchCallBackFunctionAMD)(cl_device_id device, cl_ulong event,
+ void* user_args);
+
+/*! \brief Set up the dispatch call back function pointers
+ *
+ * \param device specifies the device to be used
+ *
+ * \param preDispatchFunction is the function to be called before dispatching the kernel
+ *
+ * \param postDispatchFunction is the function to be called after kernel execution
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgSetCallBackFunctionsAMD(
+ cl_device_id /* device */, cl_PreDispatchCallBackFunctionAMD /* preDispatchFunction */,
+ cl_PostDispatchCallBackFunctionAMD /* postDispatchFunction */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Set up the arguments of the dispatch call back function
+ *
+ * \param device specifies the device to be used
+ *
+ * \param preDispatchArgs specifies the arguments for the pre-dispatch callback function
+ *
+ * \param postDispatchArgs specifies the arguments for the post-dispatch callback function
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgSetCallBackArgumentsAMD(cl_device_id /* device */,
+ void* /* preDispatchArgs */,
+ void* /* postDispatchArgs */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Invalidate all caches on the device.
+ *
+ * \param device specifies the device to be used
+ *
+ * \param mask is the mask to specify which caches to flush/invalidate
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgFlushCacheAMD(cl_device_id /* device */,
+ cl_dbg_gpu_cache_mask_amd /* mask */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Set up an exception policy in the trap handler object
+ *
+ * \param device specifies the device to be used
+ *
+ * \param policy specifies the exception policy, which includes the exception mask,
+ * wave action, host action, and wave mode.
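+ *
+ * A hypothetical usage sketch (illustrative only; 'device' is assumed to be
+ * a valid cl_device_id):
+ * \code
+ * cl_dbg_exception_policy_amd policy;
+ * policy.exceptionMask = 0x1;                  // assumed: one exception source enabled
+ * policy.waveAction = CL_DBG_WAVES_HALT;       // halt faulting waves
+ * policy.hostAction = CL_DBG_HOST_NOTIFY;      // report the exception to the host
+ * policy.waveMode = CL_DBG_WAVEMODE_BROADCAST; // all waves with matching VMID
+ * cl_int err = clHwDbgSetExceptionPolicyAMD(device, &policy);
+ * \endcode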
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the policy is not specified (NULL)
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgSetExceptionPolicyAMD(
+ cl_device_id /* device */, cl_dbg_exception_policy_amd* /* policy */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+/*! \brief Get the exception policy in the trap handler object
+ *
+ * \param device specifies the device to be used
+ *
+ * \param policy is a pointer to the memory where the policy is returned
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the policy storage is not specified
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgGetExceptionPolicyAMD(
+ cl_device_id /* device */, cl_dbg_exception_policy_amd* /* policy */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+/*! \brief Set up the kernel execution mode in the trap handler object
+ *
+ * \param device specifies the device to be used
+ *
+ * \param mode specifies the kernel execution mode, which indicates whether single-step
+ * mode is used and how many CUs are reserved.
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the mode is not specified, i.e., has a NULL value
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgSetKernelExecutionModeAMD(
+ cl_device_id /* device */, cl_dbg_kernel_exec_mode_amd* /* mode */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Get the kernel execution mode in the trap handler object
+ *
+ * \param device specifies the device to be used
+ *
+ * \param mode is a pointer to the memory where the execution mode is returned
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the mode storage is not specified, i.e., has a NULL value
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgGetKernelExecutionModeAMD(
+ cl_device_id /* device */, cl_dbg_kernel_exec_mode_amd* /* mode */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Create a debug event
+ *
+ * \param device specifies the device to be used
+ *
+ * \param autoReset is the auto reset flag
+ *
+ * \param pDebugEvent returns the debug event to be used for exception notification
+ *
+ * \param pEventId is the event ID, which is not used at this moment
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the pDebugEvent value is NULL
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ * - CL_OUT_OF_RESOURCES if it fails to create the event
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgCreateEventAMD(cl_device_id /* device */,
+ bool /* autoReset */,
+ cl_dbg_event_amd* /* pDebugEvent */,
+ cl_uint* /* pEventId */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+/*!
\brief Wait for a debug event to be signaled
+ *
+ * \param device specifies the device to be used
+ *
+ * \param pDebugEvent is the debug event to be waited for
+ *
+ * \param pEventId is the event ID, which is not used at this moment
+ *
+ * \param timeOut is the duration for waiting
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the event occurs before the timeout
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the pDebugEvent value is NULL
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ * - CL_EVENT_TIMEOUT_AMD if timeout occurs
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgWaitEventAMD(cl_device_id /* device */,
+ cl_dbg_event_amd /* pDebugEvent */,
+ cl_uint /* pEventId */,
+ cl_uint /* timeOut */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+/*! \brief Destroy a debug event
+ *
+ * \param device specifies the device to be used
+ *
+ * \param pDebugEvent is the debug event to be destroyed
+ *
+ * \param pEventId is the event ID, which is not used at this moment
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the pDebugEvent value is NULL
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgDestroyEventAMD(cl_device_id /* device */,
+ cl_dbg_event_amd* /* pDebugEvent */,
+ cl_uint* /* pEventId */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Register the debugger on a device
+ *
+ * \param context specifies the context for the debugger
+ *
+ * \param device specifies the device to be used
+ *
+ * \param pMessageStorage specifies the memory for trap message passing between KMD and OCL runtime
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_CONTEXT if the context is not valid
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the pMessageStorage value is NULL
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ * - CL_OUT_OF_RESOURCES if a host queue cannot be created for the debugger
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgRegisterDebuggerAMD(
+ cl_context /* context */, cl_device_id /* device */, volatile void* /* pMessageStorage */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Unregister the debugger on a device
+ *
+ * \param device specifies the device to be used
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgUnregisterDebuggerAMD(cl_device_id /* device */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+/*! \brief Set up the pointer of the acl_binary to be used by the debugger
+ *
+ * \param device specifies the device to be used
+ *
+ * \param aclBinary specifies the ACL binary to be used
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the aclBinary is not provided
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgSetAclBinaryAMD(cl_device_id /* device */,
+ void* /* aclBinary */
+ ) CL_API_SUFFIX__VERSION_2_0;
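+
+/* A hypothetical usage sketch (illustrative only): create a debug event,
+   wait on it, then destroy it. 'device' is assumed valid; the timeout units
+   are assumed to be milliseconds.
+
+   cl_dbg_event_amd dbgEvent;
+   cl_uint eventId;
+   cl_int err = clHwDbgCreateEventAMD(device, true, &dbgEvent, &eventId);
+   if (err == CL_SUCCESS) {
+     err = clHwDbgWaitEventAMD(device, dbgEvent, eventId, 1000);
+     // err may be CL_EVENT_TIMEOUT_AMD if nothing was signaled in time
+     clHwDbgDestroyEventAMD(device, &dbgEvent, &eventId);
+   }
+*/
+
+/*!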
\brief Control the execution of wavefronts on the GPU
+ *
+ * \param device specifies the device to be used
+ *
+ * \param action specifies the wave action - halt, resume, kill, debug
+ *
+ * \param mode specifies the wave mode
+ *
+ * \param trapID specifies the trap ID, which should be 0x7
+ *
+ * \param waveAddress specifies the wave address for the wave control
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if an invalid action or mode value is specified
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgWaveControlAMD(cl_device_id /* device */,
+ cl_dbg_waves_action_amd /* action */,
+ cl_dbg_wave_mode_amd /* mode */,
+ cl_uint /* trapId */,
+ cl_dbg_wave_addr_amd /* waveAddress */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+/*! \brief Set watch points on memory address ranges to generate exception events
+ *
+ * \param device specifies the device to be used
+ *
+ * \param numWatchPoints specifies the number of watch points
+ *
+ * \param watchMode is the array of watch modes for the watch points
+ *
+ * \param watchAddress is the array of watch addresses for the watch points
+ *
+ * \param watchMask is the array of masks for the watch points
+ *
+ * \param watchEvent is the array of events for the watch points
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the number of watch points is <= 0, or other parameters are not specified
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgAddressWatchAMD(
+ cl_device_id /* device */, cl_uint /* numWatchPoints */,
+ cl_dbg_address_watch_mode_amd* /* watchMode */, void** /* watchAddress */,
+ cl_ulong* /* watchMask */, cl_dbg_event_amd* /* watchEvent */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+/*! \brief Get the packet information for kernel execution
+ *
+ * \param device specifies the device to be used
+ *
+ * \param aqlCodeInfo specifies the kernel code and its size
+ *
+ * \param packetInfo points to the memory for the packet information to be returned
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgGetAqlPacketInfoAMD(
+ cl_device_id /* device */, const void* /* aqlCodeInfo */,
+ cl_aql_packet_info_amd* /* packetInfo */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Get the dispatch debug information
+ *
+ * \param device specifies the device to be used
+ *
+ * \param debugInfo points to the memory for the debug information to be returned
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgGetDispatchDebugInfoAMD(
+ cl_device_id /* device */, cl_dispatch_debug_info_amd* /* debugInfo */
+ ) CL_API_SUFFIX__VERSION_2_0;
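+
+/* A hypothetical usage sketch (illustrative only): halt all waves matching
+   the current VMID, then resume them. 'device' is assumed valid; trap ID 0x7
+   follows the note above; the wave address fields are assumed to be ignored
+   in broadcast mode.
+
+   cl_dbg_wave_addr_amd addr;
+   memset(&addr, 0, sizeof(addr));
+   cl_int err = clHwDbgWaveControlAMD(device, CL_DBG_WAVES_HALT,
+                                      CL_DBG_WAVEMODE_BROADCAST, 0x7, addr);
+   if (err == CL_SUCCESS)
+     err = clHwDbgWaveControlAMD(device, CL_DBG_WAVES_RESUME,
+                                 CL_DBG_WAVEMODE_BROADCAST, 0x7, addr);
+*/
+
+/*!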
\brief Map the video memory for the kernel code to allow host access
+ *
+ * \param device specifies the device to be used
+ *
+ * \param aqlCodeAddress points to the memory that receives the returned host address for the kernel
+ * code
+ *
+ * \param aqlCodeSize returns the size of the kernel code
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgMapKernelCodeAMD(cl_device_id /* device */,
+ void* /* aqlCodeInfo */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Unmap the video memory for the kernel code
+ *
+ * \param device specifies the device to be used (not needed, kept for consistency)
+ *
+ * \param aqlCodeAddress points to the mapped memory address for the kernel code
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgUnmapKernelCodeAMD(cl_device_id /* device */,
+ cl_ulong* /* aqlCodeAddress */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Map the shader scratch ring's video memory to allow CPU access
+ *
+ * \param device specifies the device to be used
+ *
+ * \param scratchRingAddr points to the memory that receives the returned host address for the scratch
+ * ring
+ *
+ * \param scratchRingSize returns the size of the scratch ring
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgMapScratchRingAMD(cl_device_id /* device */,
+ cl_ulong* /* scratchRingAddr */,
+ cl_uint* /* scratchRingSize */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+/*! \brief Unmap the shader scratch ring's video memory
+ *
+ * \param device specifies the device to be used (not needed, kept for consistency)
+ *
+ * \param scratchRingAddr points to the mapped memory address for the scratch ring
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgUnmapScratchRingAMD(cl_device_id /* device */,
+ cl_ulong* /* scratchRingAddr */
+ ) CL_API_SUFFIX__VERSION_2_0;
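+
+/* A hypothetical usage sketch (illustrative only): map the scratch ring,
+   inspect it, then unmap it. 'device' is assumed valid.
+
+   cl_ulong scratchAddr = 0;
+   cl_uint scratchSize = 0;
+   cl_int err = clHwDbgMapScratchRingAMD(device, &scratchAddr, &scratchSize);
+   if (err == CL_SUCCESS) {
+     // scratch memory is now host-visible at (void*)(uintptr_t)scratchAddr
+     err = clHwDbgUnmapScratchRingAMD(device, &scratchAddr);
+   }
+*/
+
+/*!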
\brief Get the memory object associated with the kernel parameter
+ *
+ * \param device specifies the device to be used
+ *
+ * \param paramIdx is the index of the kernel argument
+ *
+ * \param paramMem is a pointer to the memory associated with the kernel argument to be returned
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if the paramIdx is less than zero, or the paramMem has NULL value
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ * - CL_INVALID_KERNEL_ARGS if it fails to get the memory object for the kernel argument
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgGetKernelParamMemAMD(cl_device_id /* device */,
+ cl_uint /* paramIdx */,
+ cl_mem* /* paramMem */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+/*! \brief Set value of a global memory object
+ *
+ * \param device specifies the device to be used
+ *
+ * \param memObject is the memory object handle to be assigned the value specified in srcMem.
+ *
+ * \param offset is the offset into the memory object
+ *
+ * \param srcMem points to the memory which contains the values to be assigned to the memory
+ *
+ * \param size size (in bytes) of the srcMem
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if memObject or srcMem has a NULL value, size <= 0 or offset < 0
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgSetGlobalMemoryAMD(cl_device_id /* device */,
+ cl_mem /* memObject */,
+ cl_uint /* offset */,
+ void* /* srcMem */,
+ cl_uint /* size */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+/*! \brief Install the trap handler of a given type
+ *
+ * \param device specifies the device to be used
+ *
+ * \param trapType is the type of trap handler
+ *
+ * \param trapHandler is the pointer to the trap handler (TBA)
+ *
+ * \param trapBuffer is the pointer to the trap handler buffer (TMA)
+ *
+ * \param trapHandlerSize size (in bytes) of the trap handler
+ *
+ * \param trapBufferSize size (in bytes) of the trap handler buffer
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully
+ * - CL_INVALID_DEVICE if the device is not valid
+ * - CL_INVALID_VALUE if trapHandler is NULL or trapHandlerSize <= 0
+ * - CL_HWDBG_MANAGER_NOT_AVAILABLE_AMD if there is no HW DEBUG manager
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL clHwDbgInstallTrapAMD(cl_device_id /* device */,
+ cl_dbg_trap_type_amd /* trapType */,
+ cl_mem /* trapHandler */,
+ cl_mem /* trapBuffer */
+ ) CL_API_SUFFIX__VERSION_2_0;
+
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif /*__cplusplus*/
+
+#endif /*__CL_DEBUGGER_AMD_H*/
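+
+/* A hypothetical end-to-end sketch built on the declarations above
+   (illustrative only, not part of the patch): fetch the memory object bound
+   to kernel argument 0 and overwrite its first four bytes. 'device' is
+   assumed to be a valid cl_device_id with the HW DEBUG manager available.
+
+   cl_mem paramMem = NULL;
+   cl_int err = clHwDbgGetKernelParamMemAMD(device, 0, &paramMem);
+   if (err == CL_SUCCESS) {
+     cl_uint newValue = 42;  // arbitrary example payload
+     err = clHwDbgSetGlobalMemoryAMD(device, paramMem, 0, &newValue,
+                                     (cl_uint)sizeof(newValue));
+   }
+*/
diff --git a/amdocl/cl_icd.cpp b/amdocl/cl_icd.cpp
new file mode 100644
index 0000000000..ec2cb48d7d
--- /dev/null
+++ b/amdocl/cl_icd.cpp
@@ -0,0 +1,293 @@
+/* Copyright (c) 2008-present Advanced Micro Devices, Inc.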
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "cl_common.hpp"
+#include "vdi_common.hpp"
+#ifdef _WIN32
+#include
+#include "cl_d3d9_amd.hpp"
+#include "cl_d3d10_amd.hpp"
+#include "cl_d3d11_amd.hpp"
+#endif //_WIN32
+
+#include <algorithm>
+
+#include <mutex>
+
+amd::PlatformIDS amd::PlatformID::Platform = //{ NULL };
+ {amd::ICDDispatchedObject::icdVendorDispatch_};
+
+static cl_int CL_API_CALL icdGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name,
+ size_t param_value_size, void* param_value,
+ size_t* param_value_size_ret) {
+ return clGetPlatformInfo(NULL, param_name, param_value_size, param_value, param_value_size_ret);
+}
+
+static cl_int CL_API_CALL icdGetDeviceIDs(cl_platform_id platform, cl_device_type device_type,
+ cl_uint num_entries, cl_device_id* devices,
+ cl_uint* num_devices) {
+ return clGetDeviceIDs(NULL, device_type, num_entries, devices, num_devices);
+}
+
+static cl_int CL_API_CALL icdGetDeviceInfo(cl_device_id device, cl_device_info param_name,
+ size_t param_value_size, void* param_value,
+ size_t* param_value_size_ret) {
+ if (param_name == CL_DEVICE_PLATFORM) {
+ // Return the ICD platform instead of the default NULL platform.
+ cl_platform_id platform = reinterpret_cast<cl_platform_id>(&amd::PlatformID::Platform);
+ return amd::clGetInfo(platform, param_value_size, param_value, param_value_size_ret);
+ }
+
+ return clGetDeviceInfo(device, param_name, param_value_size, param_value, param_value_size_ret);
+}
+
+cl_icd_dispatch amd::ICDDispatchedObject::icdVendorDispatch_[] = {
+ {NULL /* should not get called */, icdGetPlatformInfo, icdGetDeviceIDs, icdGetDeviceInfo,
+ clCreateContext, clCreateContextFromType, clRetainContext, clReleaseContext, clGetContextInfo,
+ clCreateCommandQueue, clRetainCommandQueue, clReleaseCommandQueue, clGetCommandQueueInfo,
+ clSetCommandQueueProperty, clCreateBuffer, clCreateImage2D, clCreateImage3D, clRetainMemObject,
+ clReleaseMemObject, clGetSupportedImageFormats, clGetMemObjectInfo, clGetImageInfo,
+ clCreateSampler, clRetainSampler, clReleaseSampler, clGetSamplerInfo,
+ clCreateProgramWithSource, clCreateProgramWithBinary, clRetainProgram, clReleaseProgram,
+ clBuildProgram, clUnloadCompiler, clGetProgramInfo, clGetProgramBuildInfo, clCreateKernel,
+ clCreateKernelsInProgram, clRetainKernel, clReleaseKernel, clSetKernelArg, clGetKernelInfo,
+ clGetKernelWorkGroupInfo, clWaitForEvents, clGetEventInfo, clRetainEvent, clReleaseEvent,
+ clGetEventProfilingInfo, clFlush, clFinish, clEnqueueReadBuffer, clEnqueueWriteBuffer,
+ clEnqueueCopyBuffer, clEnqueueReadImage, clEnqueueWriteImage, clEnqueueCopyImage,
+ clEnqueueCopyImageToBuffer, clEnqueueCopyBufferToImage, clEnqueueMapBuffer, clEnqueueMapImage,
+ clEnqueueUnmapMemObject, clEnqueueNDRangeKernel, clEnqueueTask, clEnqueueNativeKernel,
+ clEnqueueMarker, clEnqueueWaitForEvents, clEnqueueBarrier, clGetExtensionFunctionAddress,
+ clCreateFromGLBuffer, clCreateFromGLTexture2D, clCreateFromGLTexture3D,
+ clCreateFromGLRenderbuffer, clGetGLObjectInfo, clGetGLTextureInfo, clEnqueueAcquireGLObjects,
+ clEnqueueReleaseGLObjects, clGetGLContextInfoKHR,
+ WINDOWS_SWITCH(clGetDeviceIDsFromD3D10KHR, NULL),
+ WINDOWS_SWITCH(clCreateFromD3D10BufferKHR, NULL),
+ WINDOWS_SWITCH(clCreateFromD3D10Texture2DKHR, NULL),
+ WINDOWS_SWITCH(clCreateFromD3D10Texture3DKHR, NULL),
+ WINDOWS_SWITCH(clEnqueueAcquireD3D10ObjectsKHR, NULL),
+ WINDOWS_SWITCH(clEnqueueReleaseD3D10ObjectsKHR, NULL), clSetEventCallback, clCreateSubBuffer,
+ clSetMemObjectDestructorCallback, clCreateUserEvent, clSetUserEventStatus,
+ clEnqueueReadBufferRect, clEnqueueWriteBufferRect, clEnqueueCopyBufferRect,
+ NULL, NULL, NULL, clCreateEventFromGLsyncKHR,
+
+ /* OpenCL 1.2*/
+ clCreateSubDevices, clRetainDevice, clReleaseDevice, clCreateImage,
+ clCreateProgramWithBuiltInKernels, clCompileProgram, clLinkProgram, clUnloadPlatformCompiler,
+ clGetKernelArgInfo, clEnqueueFillBuffer, clEnqueueFillImage, clEnqueueMigrateMemObjects,
+ clEnqueueMarkerWithWaitList, clEnqueueBarrierWithWaitList,
+ clGetExtensionFunctionAddressForPlatform, clCreateFromGLTexture,
+
+ WINDOWS_SWITCH(clGetDeviceIDsFromD3D11KHR, NULL),
+ WINDOWS_SWITCH(clCreateFromD3D11BufferKHR, NULL),
+ WINDOWS_SWITCH(clCreateFromD3D11Texture2DKHR, NULL),
+ WINDOWS_SWITCH(clCreateFromD3D11Texture3DKHR, NULL),
+ WINDOWS_SWITCH(clCreateFromDX9MediaSurfaceKHR, NULL),
+ WINDOWS_SWITCH(clEnqueueAcquireD3D11ObjectsKHR, NULL),
+ WINDOWS_SWITCH(clEnqueueReleaseD3D11ObjectsKHR, NULL),
+
+ WINDOWS_SWITCH(clGetDeviceIDsFromDX9MediaAdapterKHR,
+ NULL), // KHRpfn_clGetDeviceIDsFromDX9MediaAdapterKHR
+ // clGetDeviceIDsFromDX9MediaAdapterKHR;
+ WINDOWS_SWITCH(
+ clEnqueueAcquireDX9MediaSurfacesKHR,
+ NULL), // KHRpfn_clEnqueueAcquireDX9MediaSurfacesKHR
clEnqueueAcquireDX9MediaSurfacesKHR;
+ WINDOWS_SWITCH(
+ clEnqueueReleaseDX9MediaSurfacesKHR,
+ NULL), // KHRpfn_clEnqueueReleaseDX9MediaSurfacesKHR clEnqueueReleaseDX9MediaSurfacesKHR;
+
+ NULL,
+ NULL, NULL, NULL,
+
+ clCreateCommandQueueWithProperties, clCreatePipe, clGetPipeInfo, clSVMAlloc, clSVMFree,
+ clEnqueueSVMFree, clEnqueueSVMMemcpy, clEnqueueSVMMemFill, clEnqueueSVMMap, clEnqueueSVMUnmap,
+ clCreateSamplerWithProperties, clSetKernelArgSVMPointer, clSetKernelExecInfo,
+ clGetKernelSubGroupInfo,
+ clCloneKernel,
+ clCreateProgramWithIL,
+ clEnqueueSVMMigrateMem,
+ clGetDeviceAndHostTimer,
+ clGetHostTimer,
+ clGetKernelSubGroupInfo,
+ clSetDefaultDeviceCommandQueue,
+
+ clSetProgramReleaseCallback,
+ clSetProgramSpecializationConstant }};
+
+#if defined(ATI_OS_WIN)
+#include <shlwapi.h>
+
+#pragma comment(lib, "shlwapi.lib")
+
+static bool ShouldLoadPlatform() {
+ // Get the OpenCL ICD registry values
+ HKEY platformsKey = NULL;
+ if (RegOpenKeyExA(HKEY_LOCAL_MACHINE, "SOFTWARE\\Khronos\\OpenCL\\Vendors", 0, KEY_READ,
+ &platformsKey) != ERROR_SUCCESS)
+ return true;
+
+ std::vector<std::string> registryValues;
+ DWORD dwIndex = 0;
+ while (true) {
+ char cszLibraryName[1024] = {0};
+ DWORD dwLibraryNameSize = sizeof(cszLibraryName);
+ DWORD dwLibraryNameType = 0;
+ DWORD dwValue = 0;
+ DWORD dwValueSize = sizeof(dwValue);
+
+ if (RegEnumValueA(platformsKey, dwIndex++, cszLibraryName, &dwLibraryNameSize, NULL,
+ &dwLibraryNameType, (LPBYTE)&dwValue, &dwValueSize) != ERROR_SUCCESS)
+ break;
+ // Require that the value be a DWORD and equal zero
+ if (dwLibraryNameType != REG_DWORD || dwValue != 0) {
+ continue;
+ }
+ registryValues.push_back(cszLibraryName);
+ }
+ RegCloseKey(platformsKey);
+
+ HMODULE hm = NULL;
+ if (!GetModuleHandleExA(
+ GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+ (LPCSTR)&ShouldLoadPlatform, &hm))
+ return true;
+
+ char cszDllPath[1024] = {0};
+ if (!GetModuleFileNameA(hm, cszDllPath, sizeof(cszDllPath))) return true;
+
+ // If we are loaded from the DriverStore, then there should be a registry
+ // value matching our current module absolute path.
+ if (std::find(registryValues.begin(), registryValues.end(), cszDllPath) == registryValues.end())
+ return true;
+
+ LPSTR cszFileName;
+ char buffer[1024] = {0};
+ if (!GetFullPathNameA(cszDllPath, sizeof(buffer), buffer, &cszFileName)) return true;
+
+ // We found an absolute path in the registry that matched this DLL, now
+ // check if there is also an entry with the same filename.
+ if (std::find(registryValues.begin(), registryValues.end(), cszFileName) == registryValues.end())
+ return true;
+
+ // Lastly, check if there is a DLL with the same name in the System folder.
+ char cszSystemPath[1024] = {0};
+#if defined(ATI_BITS_32)
+ if (!GetSystemWow64DirectoryA(cszSystemPath, sizeof(cszSystemPath)))
+#endif // defined(ATI_BITS_32)
+ if (!GetSystemDirectoryA(cszSystemPath, sizeof(cszSystemPath))) return true;
+
+ std::string systemDllPath;
+ systemDllPath.append(cszSystemPath).append("\\").append(cszFileName);
+ if (!PathFileExistsA(systemDllPath.c_str())) {
+ return true;
+ }
+
+ // If we get here, then all 3 conditions are true:
+ // - An entry in the registry with an absolute path matches the current DLL
+ // - An entry in the registry with a relative path matches the current DLL
+ // - A DLL with the same name was found in the system directory
+ //
+ // We should not load this platform!
+
+ return false;
+}
+
+#else
+
+#include <dlfcn.h>
+
+// If there is only one platform, load it.
+// If there is more than one platform, only load platforms that have visible devices
+// If all platforms have no devices available, only load the PAL platform
+static bool ShouldLoadPlatform() {
+ bool shouldLoad = true;
+
+ if (!amd::Runtime::initialized()) {
+ amd::Runtime::init();
+ }
+ const int numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, false);
+
+ void *otherPlatform = nullptr;
+ if (amd::IS_LEGACY) {
+ otherPlatform = dlopen("libamdocl64.so", RTLD_LAZY);
+ if (otherPlatform != nullptr) { // Present platform exists
+ shouldLoad = numDevices > 0;
+ }
+ } else {
+ otherPlatform = dlopen("libamdocl-orca64.so", RTLD_LAZY);
+ if (otherPlatform != nullptr) { // Legacy platform exists
+ // gcc4.8 doesn't support casting void* to a function pointer
+ // Work around this by creating a typedef until we upgrade the compiler
+ typedef void*(*clGetFunctionAddress_t)(const char *);
+ typedef cl_int(*clIcdGetPlatformIDs_t)(cl_uint, cl_platform_id *, cl_uint *);
+
+ clGetFunctionAddress_t legacyGetFunctionAddress =
+ reinterpret_cast<clGetFunctionAddress_t>(dlsym(otherPlatform, "clGetExtensionFunctionAddress"));
+ clIcdGetPlatformIDs_t legacyGetPlatformIDs =
+ reinterpret_cast<clIcdGetPlatformIDs_t>(legacyGetFunctionAddress("clIcdGetPlatformIDsKHR"));
+
+ cl_uint numLegacyPlatforms = 0;
+ legacyGetPlatformIDs(0, nullptr, &numLegacyPlatforms);
+
+ shouldLoad = (numDevices > 0) || (numLegacyPlatforms == 0);
+ }
+ }
+
+ if (otherPlatform != nullptr) {
+ dlclose(otherPlatform);
+ }
+
+ return shouldLoad;
+}
+
+#endif // defined(ATI_OS_WIN)
+
+CL_API_ENTRY cl_int CL_API_CALL clIcdGetPlatformIDsKHR(cl_uint num_entries,
+ cl_platform_id* platforms,
+ cl_uint* num_platforms) {
+ if (((num_entries > 0 || num_platforms == NULL) && platforms == NULL) ||
+ (num_entries == 0 && platforms != NULL)) {
+ return CL_INVALID_VALUE;
+ }
+
+ static bool shouldLoad = true;
+
+ static std::once_flag initOnce;
+ std::call_once(initOnce, [](){ shouldLoad = ShouldLoadPlatform(); });
+
+ if (!shouldLoad) {
+ *not_null(num_platforms) = 0;
+ return CL_SUCCESS;
+ }
+
+ if (!amd::Runtime::initialized()) {
+ amd::Runtime::init();
+ }
+
+ if (num_platforms != NULL && platforms == NULL) {
+ *num_platforms = 1;
+ return CL_SUCCESS;
+ }
+
+ assert(platforms != NULL && "check the code above");
+ *platforms = reinterpret_cast<cl_platform_id>(&amd::PlatformID::Platform);
+
+ *not_null(num_platforms) = 1;
+ return CL_SUCCESS;
+}
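+
+/* A minimal illustrative sketch of the two-call pattern clIcdGetPlatformIDsKHR
+   supports, as the Khronos ICD loader would drive it (hypothetical usage, not
+   part of the patch):
+
+   cl_uint numPlatforms = 0;
+   cl_int err = clIcdGetPlatformIDsKHR(0, NULL, &numPlatforms);  // query the count
+   if (err == CL_SUCCESS && numPlatforms > 0) {
+     cl_platform_id platform = NULL;
+     err = clIcdGetPlatformIDsKHR(1, &platform, NULL);           // fetch the handle
+   }
+*/
diff --git a/amdocl/cl_icd_amd.h b/amdocl/cl_icd_amd.h
new file mode 100644
index 0000000000..69408e75ac
--- /dev/null
+++ b/amdocl/cl_icd_amd.h
@@ -0,0 +1,739 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.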
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_ICD_H
+#define __OPENCL_CL_ICD_H
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+
+#define cl_khr_icd 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+typedef cl_int(CL_API_CALL* clGetPlatformIDs_fn)(
+ cl_uint /* num_entries */, cl_platform_id* /* platforms */,
+ cl_uint* /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_int(CL_API_CALL* clGetPlatformInfo_fn)(
+ cl_platform_id /* platform */, cl_platform_info /* param_name */, size_t /* param_value_size */,
+ void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_int(CL_API_CALL* clGetDeviceIDs_fn)(
+ cl_platform_id /* platform */, cl_device_type /* device_type */, cl_uint /* num_entries */,
+ cl_device_id* /* devices */, cl_uint* /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_int(CL_API_CALL* clGetDeviceInfo_fn)(
+ cl_device_id /* device */, cl_device_info /* param_name */, size_t /* param_value_size */,
+ void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_context(CL_API_CALL* clCreateContext_fn)(
+ const cl_context_properties* /* properties */, cl_uint /* num_devices */,
+ const cl_device_id* /* devices */,
+ void(CL_CALLBACK* /* pfn_notify */)(const char*, const void*, size_t, void*),
+ void* /* user_data */, cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_context(CL_API_CALL* clCreateContextFromType_fn)(
+ const cl_context_properties* /* properties */, cl_device_type /* device_type */,
+ void(CL_CALLBACK* /* pfn_notify*/)(const char*, const void*, size_t, void*),
+ void* /* user_data */, cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_int(CL_API_CALL* clRetainContext_fn)(cl_context /* context */)
+ CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_int(CL_API_CALL* clReleaseContext_fn)(cl_context /* context */)
+ CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_int(CL_API_CALL* clGetContextInfo_fn)(
+ cl_context /* context */, cl_context_info /* param_name */, size_t /* param_value_size */,
+ void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_command_queue(CL_API_CALL* clCreateCommandQueue_fn)(
+ cl_context /* context */, cl_device_id /* device */,
+ cl_command_queue_properties /* properties */,
+ cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_int(CL_API_CALL* clRetainCommandQueue_fn)(cl_command_queue /* command_queue */)
+ CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_int(CL_API_CALL* clReleaseCommandQueue_fn)(cl_command_queue /* command_queue */)
+ CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_int(CL_API_CALL* clGetCommandQueueInfo_fn)(
+ cl_command_queue /* command_queue */, cl_command_queue_info /* param_name */,
+ size_t /* param_value_size */, void* /* param_value */,
+ size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef cl_int(CL_API_CALL* clSetCommandQueueProperty_fn)(
+ cl_command_queue /* command_queue */, cl_command_queue_properties /* properties */,
+ cl_bool /* enable */,
+ cl_command_queue_properties* /* old_properties */) /*CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED*/;
+
+typedef cl_mem(CL_API_CALL* clCreateBuffer_fn)(
+ cl_context /* context */, cl_mem_flags /* flags */, size_t /* size */, void* /* host_ptr */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_mem(CL_API_CALL* clCreateSubBuffer_fn)( + cl_mem /* buffer */, cl_mem_flags /* flags */, cl_buffer_create_type /* buffer_create_type */, + const void* /* buffer_create_info */, cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +typedef cl_mem(CL_API_CALL* clCreateImage2D_fn)( + cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format* /* image_format */, + size_t /* image_width */, size_t /* image_height */, size_t /* image_row_pitch */, + void* /* host_ptr */, cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_mem(CL_API_CALL* clCreateImage3D_fn)( + cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format* /* image_format */, + size_t /* image_width */, size_t /* image_height */, size_t /* image_depth */, + size_t /* image_row_pitch */, size_t /* image_slice_pitch */, void* /* host_ptr */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clRetainMemObject_fn)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clReleaseMemObject_fn)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetSupportedImageFormats_fn)( + cl_context /* context */, cl_mem_flags /* flags */, cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, cl_image_format* /* image_formats */, + cl_uint* /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetMemObjectInfo_fn)( + cl_mem /* memobj */, cl_mem_info /* param_name */, size_t /* param_value_size */, + void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetImageInfo_fn)( + cl_mem /* image */, cl_image_info /* param_name */, size_t /* param_value_size */, + void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clSetMemObjectDestructorCallback_fn)( + cl_mem /* memobj */, + void(CL_CALLBACK* /*pfn_notify*/)(cl_mem /* memobj */, void* /*user_data*/), + void* /*user_data */) CL_API_SUFFIX__VERSION_1_1; + +/* Sampler APIs */ +typedef cl_sampler(CL_API_CALL* clCreateSampler_fn)( + cl_context /* context */, cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, cl_filter_mode /* filter_mode */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clRetainSampler_fn)(cl_sampler /* sampler */) + CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clReleaseSampler_fn)(cl_sampler /* sampler */) + CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetSamplerInfo_fn)( + cl_sampler /* sampler */, cl_sampler_info /* param_name */, size_t /* param_value_size */, + void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +typedef cl_program(CL_API_CALL* clCreateProgramWithSource_fn)( + cl_context /* context */, cl_uint /* count */, const char** /* strings */, + const size_t* /* lengths */, cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithIL(cl_context /* context */, + const void * /* strings */, size_t /* lengths */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_2_0; + +typedef cl_program(CL_API_CALL* clCreateProgramWithILKHR_fn)( + cl_context /* context */, const void* /* il */, 
size_t /* length */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_program(CL_API_CALL* clCreateProgramWithBinary_fn)( + cl_context /* context */, cl_uint /* num_devices */, const cl_device_id* /* device_list */, + const size_t* /* lengths */, const unsigned char** /* binaries */, cl_int* /* binary_status */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clRetainProgram_fn)(cl_program /* program */) + CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clReleaseProgram_fn)(cl_program /* program */) + CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clBuildProgram_fn)( + cl_program /* program */, cl_uint /* num_devices */, const cl_device_id* /* device_list */, + const char* /* options */, + void(CL_CALLBACK* /* pfn_notify */)(cl_program /* program */, void* /* user_data */), + void* /* user_data */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clUnloadCompiler_fn)(void) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetProgramInfo_fn)( + cl_program /* program */, cl_program_info /* param_name */, size_t /* param_value_size */, + void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetProgramBuildInfo_fn)( + cl_program /* program */, cl_device_id /* device */, cl_program_build_info /* param_name */, + size_t /* param_value_size */, void* /* param_value */, + size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +typedef cl_kernel(CL_API_CALL* clCreateKernel_fn)( + cl_program /* program */, const char* /* kernel_name */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clCreateKernelsInProgram_fn)( + cl_program /* program */, cl_uint /* num_kernels */, cl_kernel* /* kernels */, + cl_uint* /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clRetainKernel_fn)(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clReleaseKernel_fn)(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clSetKernelArg_fn)(cl_kernel /* kernel */, cl_uint /* arg_index */, + size_t /* arg_size */, const void* /* arg_value */) + CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetKernelInfo_fn)( + cl_kernel /* kernel */, cl_kernel_info /* param_name */, size_t /* param_value_size */, + void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetKernelWorkGroupInfo_fn)( + cl_kernel /* kernel */, cl_device_id /* device */, cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, void* /* param_value */, + size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Event Object APIs */ +typedef cl_int(CL_API_CALL* clWaitForEvents_fn)( + cl_uint /* num_events */, const cl_event* /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetEventInfo_fn)( + cl_event /* event */, cl_event_info /* param_name */, size_t /* param_value_size */, + void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_event(CL_API_CALL* clCreateUserEvent_fn)( + cl_context /* context */, cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +typedef cl_int(CL_API_CALL* clRetainEvent_fn)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clReleaseEvent_fn)(cl_event /* event */) 
CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clSetUserEventStatus_fn)( + cl_event /* event */, cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +typedef cl_int(CL_API_CALL* clSetEventCallback_fn)( + cl_event /* event */, cl_int /* command_exec_callback_type */, + void(CL_CALLBACK* /* pfn_notify */)(cl_event, cl_int, void*), + void* /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +/* Profiling APIs */ +typedef cl_int(CL_API_CALL* clGetEventProfilingInfo_fn)( + cl_event /* event */, cl_profiling_info /* param_name */, size_t /* param_value_size */, + void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +typedef cl_int(CL_API_CALL* clFlush_fn)(cl_command_queue /* command_queue */) + CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clFinish_fn)(cl_command_queue /* command_queue */) + CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +typedef cl_int(CL_API_CALL* clEnqueueReadBuffer_fn)( + cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */, + size_t /* offset */, size_t /* cb */, void* /* ptr */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueReadBufferRect_fn)( + cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */, + const size_t* /* buffer_offset */, const size_t* /* host_offset */, const size_t* /* region */, + size_t /* buffer_row_pitch */, size_t /* buffer_slice_pitch */, size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, void* /* ptr */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_1; + +typedef cl_int(CL_API_CALL* clEnqueueWriteBuffer_fn)( + cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_write */, + size_t /* offset */, size_t /* cb */, const void* /* ptr */, + cl_uint /* num_events_in_wait_list */, const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueWriteBufferRect_fn)( + cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */, + const size_t* /* buffer_offset */, const size_t* /* host_offset */, const size_t* /* region */, + size_t /* buffer_row_pitch */, size_t /* buffer_slice_pitch */, size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, const void* /* ptr */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_1; + +typedef cl_int(CL_API_CALL* clEnqueueCopyBuffer_fn)( + cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_buffer */, + size_t /* src_offset */, size_t /* dst_offset */, size_t /* cb */, + cl_uint /* num_events_in_wait_list */, const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueCopyBufferRect_fn)( + cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_buffer */, + const size_t* /* src_origin */, const size_t* /* dst_origin */, const size_t* /* region */, + size_t /* src_row_pitch */, size_t /* src_slice_pitch */, size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_1; + +typedef 
cl_int(CL_API_CALL* clEnqueueReadImage_fn)( + cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_read */, + const size_t* /* origin[3] */, const size_t* /* region[3] */, size_t /* row_pitch */, + size_t /* slice_pitch */, void* /* ptr */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueWriteImage_fn)( + cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_write */, + const size_t* /* origin[3] */, const size_t* /* region[3] */, size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, const void* /* ptr */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueCopyImage_fn)( + cl_command_queue /* command_queue */, cl_mem /* src_image */, cl_mem /* dst_image */, + const size_t* /* src_origin[3] */, const size_t* /* dst_origin[3] */, + const size_t* /* region[3] */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueCopyImageToBuffer_fn)( + cl_command_queue /* command_queue */, cl_mem /* src_image */, cl_mem /* dst_buffer */, + const size_t* /* src_origin[3] */, const size_t* /* region[3] */, size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueCopyBufferToImage_fn)( + cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_image */, + size_t /* src_offset */, const size_t* /* dst_origin[3] */, const size_t* /* region[3] */, + cl_uint /* num_events_in_wait_list */, const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef void*(CL_API_CALL* clEnqueueMapBuffer_fn)( + cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, size_t /* offset */, size_t /* cb */, + cl_uint /* num_events_in_wait_list */, const cl_event* /* event_wait_list */, + cl_event* /* event */, cl_int* /* errcode_ret */)CL_API_SUFFIX__VERSION_1_0; + +typedef void*(CL_API_CALL* clEnqueueMapImage_fn)( + cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, const size_t* /* origin[3] */, const size_t* /* region[3] */, + size_t* /* image_row_pitch */, size_t* /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, const cl_event* /* event_wait_list */, + cl_event* /* event */, cl_int* /* errcode_ret */)CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueUnmapMemObject_fn)( + cl_command_queue /* command_queue */, cl_mem /* memobj */, void* /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueNDRangeKernel_fn)( + cl_command_queue /* command_queue */, cl_kernel /* kernel */, cl_uint /* work_dim */, + const size_t* /* global_work_offset */, const size_t* /* global_work_size */, + const size_t* /* local_work_size */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueTask_fn)(cl_command_queue /* command_queue */, 
+ cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueNativeKernel_fn)( + cl_command_queue /* command_queue */, void(CL_CALLBACK* user_func)(void*), void* /* args */, + size_t /* cb_args */, cl_uint /* num_mem_objects */, const cl_mem* /* mem_list */, + const void** /* args_mem_loc */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueMarker_fn)(cl_command_queue /* command_queue */, + cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueWaitForEvents_fn)( + cl_command_queue /* command_queue */, cl_uint /* num_events */, + const cl_event* /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueBarrier_fn)(cl_command_queue /* command_queue */) + CL_API_SUFFIX__VERSION_1_0; + +typedef void*(CL_API_CALL* clGetExtensionFunctionAddress_fn)(const char* /* func_name */) + CL_API_SUFFIX__VERSION_1_0; + +typedef cl_mem(CL_API_CALL* clCreateFromGLBuffer_fn)( + cl_context /* context */, cl_mem_flags /* flags */, cl_GLuint /* bufobj */, + int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_mem(CL_API_CALL* clCreateFromGLTexture2D_fn)( + cl_context /* context */, cl_mem_flags /* flags */, cl_GLenum /* target */, + cl_GLint /* miplevel */, cl_GLuint /* texture */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_mem(CL_API_CALL* clCreateFromGLTexture3D_fn)( + cl_context /* context */, cl_mem_flags /* flags */, cl_GLenum /* target */, + cl_GLint /* miplevel */, cl_GLuint /* texture */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_mem(CL_API_CALL* clCreateFromGLRenderbuffer_fn)( + cl_context /* context */, cl_mem_flags /* flags */, cl_GLuint /* renderbuffer */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetGLObjectInfo_fn)( + cl_mem /* memobj */, cl_gl_object_type* /* gl_object_type */, + cl_GLuint* /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clGetGLTextureInfo_fn)( + cl_mem /* memobj */, cl_gl_texture_info /* param_name */, size_t /* param_value_size */, + void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_event(CL_API_CALL* clCreateEventFromGLsyncKHR_fn)( + cl_context /* context */, cl_GLsync /* cl_GLsync */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +typedef cl_int(CL_API_CALL* clEnqueueAcquireGLObjects_fn)( + cl_command_queue /* command_queue */, cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clEnqueueReleaseGLObjects_fn)( + cl_command_queue /* command_queue */, cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef cl_int(CL_API_CALL* clCreateSubDevices_fn)( + cl_device_id /* in_device */, const cl_device_partition_property* /* properties */, + cl_uint /* num_entries */, cl_device_id* /* out_devices */, + cl_uint* /* num_devices */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int(CL_API_CALL* clRetainDevice_fn)(cl_device_id /* device */) + 
CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int(CL_API_CALL* clReleaseDevice_fn)(cl_device_id /* device */) + CL_API_SUFFIX__VERSION_1_2; + +typedef cl_mem(CL_API_CALL* clCreateImage_fn)(cl_context /* context */, cl_mem_flags /* flags */, + const cl_image_format* /* image_format*/, + const cl_image_desc* /* image_desc*/, + void* /* host_ptr */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_program(CL_API_CALL* clCreateProgramWithBuiltInKernels_fn)( + cl_context /* context */, cl_uint /* num_devices */, const cl_device_id* /* device_list */, + const char* /* kernel_names */, cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int(CL_API_CALL* clCompileProgram_fn)( + cl_program /* program */, cl_uint /* num_devices */, const cl_device_id* /* device_list */, + const char* /* options */, cl_uint /* num_input_headers */, + const cl_program* /* input_headers */, const char** /* header_include_names */, + void(CL_CALLBACK* pfn_notify)(cl_program program, void* user_data), + void* /* user_data */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_program(CL_API_CALL* clLinkProgram_fn)( + cl_context /* context */, cl_uint /* num_devices */, const cl_device_id* /* device_list */, + const char* /* options */, cl_uint /* num_input_programs */, + const cl_program* /* input_programs */, + void(CL_CALLBACK* pfn_notify)(cl_program program, void* user_data), void* /* user_data */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int(CL_API_CALL* clUnloadPlatformCompiler_fn)(cl_platform_id /* platform */) + CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int(CL_API_CALL* clGetKernelArgInfo_fn)( + cl_kernel /* kernel */, cl_uint /* arg_indx */, cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, void* /* param_value */, + size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int(CL_API_CALL* clEnqueueFillBuffer_fn)( + cl_command_queue /* command_queue */, cl_mem /* buffer */, const void* /* pattern */, + size_t /* pattern_size */, size_t /* offset */, size_t /* size */, + cl_uint /* num_events_in_wait_list */, const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int(CL_API_CALL* clEnqueueFillImage_fn)( + cl_command_queue /* command_queue */, cl_mem /* image */, const void* /* fill_color */, + const size_t* /* origin */, const size_t* /* region */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int(CL_API_CALL* clEnqueueMigrateMemObjects_fn)( + cl_command_queue /* command_queue */, cl_uint /* num_mem_objects */, + const cl_mem* /* mem_objects */, cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int(CL_API_CALL* clEnqueueMarkerWithWaitList_fn)( + cl_command_queue /* command_queue */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int(CL_API_CALL* clEnqueueBarrierWithWaitList_fn)( + cl_command_queue /* command_queue */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_1_2; + +typedef void*(CL_API_CALL* clGetExtensionFunctionAddressForPlatform_fn)( + cl_platform_id /* platform */, const char* /* funcname */)CL_API_SUFFIX__VERSION_1_2; + +typedef cl_mem(CL_API_CALL* 
clCreateFromGLTexture_fn)( + cl_context /* context */, cl_mem_flags /* flags */, cl_GLenum /* texture_target */, + cl_GLint /* miplevel */, cl_GLuint /* texture */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_command_queue(CL_API_CALL* clCreateCommandQueueWithProperties_fn)( + cl_context /* context */, cl_device_id /* device */, + const cl_queue_properties* /* properties */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +typedef cl_sampler(CL_API_CALL* clCreateSamplerWithProperties_fn)( + cl_context /* context */, const cl_sampler_properties* /* properties */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +typedef void*(CL_API_CALL* clSVMAlloc_fn)(cl_context /* context */, cl_svm_mem_flags /* flags */, + size_t /* size */, + cl_uint /* alignment */)CL_API_SUFFIX__VERSION_2_0; + +typedef void(CL_API_CALL* clSVMFree_fn)(cl_context /* context */, + void* /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; + +typedef cl_int(CL_API_CALL* clSetKernelArgSVMPointer_fn)( + cl_kernel /* kernel */, cl_uint /* arg_index */, + const void* /* arg_value */) CL_API_SUFFIX__VERSION_2_0; + +typedef cl_int(CL_API_CALL* clSetKernelExecInfo_fn)( + cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */, size_t /* param_value_size */, + const void* /* param_value */) CL_API_SUFFIX__VERSION_2_0; + +typedef cl_int(CL_API_CALL* clEnqueueSVMFree_fn)( + cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */, + void* [] /* svm_pointers */, + void(CL_CALLBACK* /* pfn_free_func */)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void* [] /* svm_pointers */, void* /* user_data */), + void* /* user_data */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef cl_int(CL_API_CALL* clEnqueueSVMMemcpy_fn)( + cl_command_queue /* command_queue */, cl_bool /* blocking_copy */, void* /* dst_ptr */, + const void* /* src_ptr */, size_t /* size */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef cl_int(CL_API_CALL* clEnqueueSVMMemFill_fn)( + cl_command_queue /* command_queue */, void* /* svm_ptr */, const void* /* pattern */, + size_t /* pattern_size */, size_t /* size */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef cl_int(CL_API_CALL* clEnqueueSVMMap_fn)( + cl_command_queue /* command_queue */, cl_bool /* blocking_map */, cl_map_flags /* flags */, + void* /* svm_ptr */, size_t /* size */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef cl_int(CL_API_CALL* clEnqueueSVMUnmap_fn)(cl_command_queue /* command_queue */, + void* /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef cl_mem(CL_API_CALL* clCreatePipe_fn)(cl_context /* context */, cl_mem_flags /* flags */, + cl_uint /* pipe_packet_size */, + cl_uint /* pipe_max_packets */, + const cl_pipe_properties* /* properties */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +typedef cl_int(CL_API_CALL* clGetPipeInfo_fn)( + cl_mem /* pipe */, cl_pipe_info /* param_name */, size_t /* param_value_size */, + void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; + +typedef 
cl_int(CL_API_CALL* clGetKernelSubGroupInfoKHR_fn)( + cl_kernel /* kernel */, cl_device_id /* device */, cl_kernel_sub_group_info /* param_name */, + size_t /* input_value_size */, const void* /* input_value */, size_t /* param_value_size */, + void* /* param_value */, size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; + + +typedef cl_int(CL_API_CALL* clSetDefaultDeviceCommandQueue_fn)( + cl_context /* context */, cl_device_id /* device */, + cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_2_1; + +typedef cl_kernel(CL_API_CALL* clCloneKernel_fn)( + cl_kernel /* source_kernel */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1; + +typedef cl_int (CL_API_CALL* clEnqueueSVMMigrateMem_fn)( + cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */, + const void ** /* svm_pointers */, const size_t * /* sizes */, + cl_mem_migration_flags /* flags */, cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_1; + +typedef cl_int (CL_API_CALL* clGetDeviceAndHostTimer_fn)( + cl_device_id /* device */, cl_ulong * /* device_timestamp */, + cl_ulong * /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1; + +typedef cl_int (CL_API_CALL* clGetHostTimer_fn)( + cl_device_id /* device */, cl_ulong * /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1; + +typedef cl_int (CL_API_CALL* clSetProgramSpecializationConstant_fn)( + cl_program /* program */, cl_uint /* spec_id */, size_t /* spec_size */, + const void* /* spec_value */) CL_API_SUFFIX__VERSION_2_2; + +typedef cl_int (CL_API_CALL* clSetProgramReleaseCallback_fn)( + cl_program /* program */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program program, void * user_data), + void * /* user_data */) CL_API_SUFFIX__VERSION_2_2; + +typedef struct _cl_icd_dispatch_table { + /* OpenCL 1.0 */ + clGetPlatformIDs_fn GetPlatformIDs; + clGetPlatformInfo_fn GetPlatformInfo; + clGetDeviceIDs_fn GetDeviceIDs; + clGetDeviceInfo_fn GetDeviceInfo; + clCreateContext_fn CreateContext; + clCreateContextFromType_fn CreateContextFromType; + clRetainContext_fn RetainContext; + clReleaseContext_fn ReleaseContext; + clGetContextInfo_fn GetContextInfo; + clCreateCommandQueue_fn CreateCommandQueue; + clRetainCommandQueue_fn RetainCommandQueue; + clReleaseCommandQueue_fn ReleaseCommandQueue; + clGetCommandQueueInfo_fn GetCommandQueueInfo; + clSetCommandQueueProperty_fn SetCommandQueueProperty; + clCreateBuffer_fn CreateBuffer; + clCreateImage2D_fn CreateImage2D; + clCreateImage3D_fn CreateImage3D; + clRetainMemObject_fn RetainMemObject; + clReleaseMemObject_fn ReleaseMemObject; + clGetSupportedImageFormats_fn GetSupportedImageFormats; + clGetMemObjectInfo_fn GetMemObjectInfo; + clGetImageInfo_fn GetImageInfo; + clCreateSampler_fn CreateSampler; + clRetainSampler_fn RetainSampler; + clReleaseSampler_fn ReleaseSampler; + clGetSamplerInfo_fn GetSamplerInfo; + clCreateProgramWithSource_fn CreateProgramWithSource; + clCreateProgramWithBinary_fn CreateProgramWithBinary; + clRetainProgram_fn RetainProgram; + clReleaseProgram_fn ReleaseProgram; + clBuildProgram_fn BuildProgram; + clUnloadCompiler_fn UnloadCompiler; + clGetProgramInfo_fn GetProgramInfo; + clGetProgramBuildInfo_fn GetProgramBuildInfo; + clCreateKernel_fn CreateKernel; + clCreateKernelsInProgram_fn CreateKernelsInProgram; + clRetainKernel_fn RetainKernel; + clReleaseKernel_fn ReleaseKernel; + clSetKernelArg_fn SetKernelArg; + clGetKernelInfo_fn GetKernelInfo; + clGetKernelWorkGroupInfo_fn GetKernelWorkGroupInfo; + 
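  /* Note (added for clarity): the entry order of this table is part of the ICD ABI shared with the loader; entries must not be reordered or removed. */ +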
clWaitForEvents_fn WaitForEvents; + clGetEventInfo_fn GetEventInfo; + clRetainEvent_fn RetainEvent; + clReleaseEvent_fn ReleaseEvent; + clGetEventProfilingInfo_fn GetEventProfilingInfo; + clFlush_fn Flush; + clFinish_fn Finish; + clEnqueueReadBuffer_fn EnqueueReadBuffer; + clEnqueueWriteBuffer_fn EnqueueWriteBuffer; + clEnqueueCopyBuffer_fn EnqueueCopyBuffer; + clEnqueueReadImage_fn EnqueueReadImage; + clEnqueueWriteImage_fn EnqueueWriteImage; + clEnqueueCopyImage_fn EnqueueCopyImage; + clEnqueueCopyImageToBuffer_fn EnqueueCopyImageToBuffer; + clEnqueueCopyBufferToImage_fn EnqueueCopyBufferToImage; + clEnqueueMapBuffer_fn EnqueueMapBuffer; + clEnqueueMapImage_fn EnqueueMapImage; + clEnqueueUnmapMemObject_fn EnqueueUnmapMemObject; + clEnqueueNDRangeKernel_fn EnqueueNDRangeKernel; + clEnqueueTask_fn EnqueueTask; + clEnqueueNativeKernel_fn EnqueueNativeKernel; + clEnqueueMarker_fn EnqueueMarker; + clEnqueueWaitForEvents_fn EnqueueWaitForEvents; + clEnqueueBarrier_fn EnqueueBarrier; + clGetExtensionFunctionAddress_fn GetExtensionFunctionAddress; + clCreateFromGLBuffer_fn CreateFromGLBuffer; + clCreateFromGLTexture2D_fn CreateFromGLTexture2D; + clCreateFromGLTexture3D_fn CreateFromGLTexture3D; + clCreateFromGLRenderbuffer_fn CreateFromGLRenderbuffer; + clGetGLObjectInfo_fn GetGLObjectInfo; + clGetGLTextureInfo_fn GetGLTextureInfo; + clEnqueueAcquireGLObjects_fn EnqueueAcquireGLObjects; + clEnqueueReleaseGLObjects_fn EnqueueReleaseGLObjects; + clGetGLContextInfoKHR_fn GetGLContextInfoKHR; + void* _reservedForD3D10KHR[6]; + + /* OpenCL 1.1 */ + clSetEventCallback_fn SetEventCallback; + clCreateSubBuffer_fn CreateSubBuffer; + clSetMemObjectDestructorCallback_fn SetMemObjectDestructorCallback; + clCreateUserEvent_fn CreateUserEvent; + clSetUserEventStatus_fn SetUserEventStatus; + clEnqueueReadBufferRect_fn EnqueueReadBufferRect; + clEnqueueWriteBufferRect_fn EnqueueWriteBufferRect; + clEnqueueCopyBufferRect_fn EnqueueCopyBufferRect; + + void* _reservedForDeviceFissionEXT[3]; + clCreateEventFromGLsyncKHR_fn CreateEventFromGLsyncKHR; + + /* OpenCL 1.2 */ + clCreateSubDevices_fn CreateSubDevices; + clRetainDevice_fn RetainDevice; + clReleaseDevice_fn ReleaseDevice; + clCreateImage_fn CreateImage; + clCreateProgramWithBuiltInKernels_fn CreateProgramWithBuiltInKernels; + clCompileProgram_fn CompileProgram; + clLinkProgram_fn LinkProgram; + clUnloadPlatformCompiler_fn UnloadPlatformCompiler; + clGetKernelArgInfo_fn GetKernelArgInfo; + clEnqueueFillBuffer_fn EnqueueFillBuffer; + clEnqueueFillImage_fn EnqueueFillImage; + clEnqueueMigrateMemObjects_fn EnqueueMigrateMemObjects; + clEnqueueMarkerWithWaitList_fn EnqueueMarkerWithWaitList; + clEnqueueBarrierWithWaitList_fn EnqueueBarrierWithWaitList; + clGetExtensionFunctionAddressForPlatform_fn GetExtensionFunctionAddressForPlatform; + clCreateFromGLTexture_fn CreateFromGLTexture; + + /* cl_khr_d3d11_sharing, cl_khr_dx9_media_sharing */ + void* _reservedForD3DExtensions[10]; + + /* cl_khr_egl_image, cl_khr_egl_event */ + void* _reservedForEGLExtensions[4]; + + /* OpenCL 2.0 */ + clCreateCommandQueueWithProperties_fn CreateCommandQueueWithProperties; + clCreatePipe_fn CreatePipe; + clGetPipeInfo_fn GetPipeInfo; + clSVMAlloc_fn SVMAlloc; + clSVMFree_fn SVMFree; + clEnqueueSVMFree_fn EnqueueSVMFree; + clEnqueueSVMMemcpy_fn EnqueueSVMMemcpy; + clEnqueueSVMMemFill_fn EnqueueSVMMemFill; + clEnqueueSVMMap_fn EnqueueSVMMap; + clEnqueueSVMUnmap_fn EnqueueSVMUnmap; + clCreateSamplerWithProperties_fn CreateSamplerWithProperties; + clSetKernelArgSVMPointer_fn 
SetKernelArgSVMPointer; + clSetKernelExecInfo_fn SetKernelExecInfo; + /* cl_khr_sub_groups */ + clGetKernelSubGroupInfoKHR_fn GetKernelSubGroupInfoKHR; + + /* OpenCL 2.1 */ + clCloneKernel_fn CloneKernel; + clCreateProgramWithILKHR_fn CreateProgramWithILKHR; + clEnqueueSVMMigrateMem_fn EnqueueSVMMigrateMem; + clGetDeviceAndHostTimer_fn GetDeviceAndHostTimer; + clGetHostTimer_fn GetHostTimer; + clGetKernelSubGroupInfoKHR_fn GetKernelSubGroupInfo; + clSetDefaultDeviceCommandQueue_fn SetDefaultDeviceCommandQueue; + + /* OpenCL 2.2 */ + clSetProgramReleaseCallback_fn SetProgramReleaseCallback; + clSetProgramSpecializationConstant_fn SetProgramSpecializationConstant; + +} cl_icd_dispatch_table; + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* __OPENCL_CL_ICD_H */ diff --git a/amdocl/cl_kernel.h b/amdocl/cl_kernel.h new file mode 100644 index 0000000000..e0c960d3ea --- /dev/null +++ b/amdocl/cl_kernel.h @@ -0,0 +1,165 @@ +/* Copyright (c) 2012-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef CL_KERNEL_H_ +#define CL_KERNEL_H_ + +struct clk_builtins_t; + +// This must be a multiple of sizeof(cl_ulong16) +#define __CPU_SCRATCH_SIZE 128 + +#define CLK_PRIVATE_MEMORY_SIZE (16 * 1024) + +struct clk_thread_info_block_t { + // Warning! 
The size of this struct needs to be a multiple + // of 16 when compiling 64 bit + + struct clk_builtins_t const* builtins; + void* local_mem_base; + void* local_scratch; + const void* table_base; + size_t pad; + + uint work_dim; + size_t global_offset[4]; /*dim0,dim1,dim2,invalid(dim<0||dim>2)*/ + size_t global_size[4]; /*dim0,dim1,dim2,invalid(dim<0||dim>2)*/ + + size_t enqueued_local_size[4]; + size_t local_size[4]; /*dim0,dim1,dim2,invalid(dim<0||dim>2)*/ + size_t local_id[4]; /*dim0,dim1,dim2,invalid(dim<0||dim>2)*/ + size_t group_id[4]; /*dim0,dim1,dim2,invalid(dim<0||dim>2)*/ +}; + +typedef enum clk_value_type_t { + T_VOID, + T_CHAR, + T_SHORT, + T_INT, + T_LONG, + T_FLOAT, + T_DOUBLE, + T_POINTER, + T_CHAR2, + T_CHAR3, + T_CHAR4, + T_CHAR8, + T_CHAR16, + T_SHORT2, + T_SHORT3, + T_SHORT4, + T_SHORT8, + T_SHORT16, + T_INT2, + T_INT3, + T_INT4, + T_INT8, + T_INT16, + T_LONG2, + T_LONG3, + T_LONG4, + T_LONG8, + T_LONG16, + T_FLOAT2, + T_FLOAT3, + T_FLOAT4, + T_FLOAT8, + T_FLOAT16, + T_DOUBLE2, + T_DOUBLE3, + T_DOUBLE4, + T_DOUBLE8, + T_DOUBLE16, + T_SAMPLER, + T_SEMA, + T_STRUCT, + T_QUEUE, + T_PAD +} clk_value_type_t; + +typedef enum clk_address_space_t { + A_PRIVATE, + A_LOCAL, + A_CONSTANT, + A_GLOBAL, + A_REGION +} clk_address_space_t; + +// kernel arg access qualifier and type qualifier +typedef enum clk_arg_qualifier_t { + Q_NONE = 0, + + // for image type only, access qualifier + Q_READ = 1, + Q_WRITE = 2, + + // for pointer type only + Q_CONST = 4, // pointee + Q_RESTRICT = 8, + Q_VOLATILE = 16, // pointee + Q_PIPE = 32 // pipe + +} clk_arg_qualifier_t; + +#pragma pack(push, 4) +struct clk_parameter_descriptor_t { + clk_value_type_t type; + clk_address_space_t space; + uint qualifier; + const char* name; +}; +#pragma pack(pop) + +//#define CLK_LOCAL_MEM_FENCE (1 << 0) +//#define CLK_GLOBAL_MEM_FENCE (1 << 1) + +struct clk_builtins_t { + /* Synchronization functions */ + void (*barrier_ptr)(cl_mem_fence_flags flags); + + /* AMD Only builtins: FIXME_lmoriche (extension) */ + void* reserved; + int (*printf_ptr)(const char* format, ...); +}; + +enum clk_natures_t { KN_HAS_BARRIER = 1 << 0, KN_WG_LEVEL = 1 << 1 }; + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4200) +#endif + +#if !defined(__OPENCL_VERSION__) || __OPENCL_VERSION__ >= 200 + +typedef struct clk_pipe_t { + size_t read_idx; + size_t write_idx; + size_t end_idx; + char padding[128 - 3 * sizeof(size_t)]; + char packets[]; +} clk_pipe_t; + +#endif + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#endif /*CL_KERNEL_H_*/ diff --git a/amdocl/cl_profile_amd.h b/amdocl/cl_profile_amd.h new file mode 100644 index 0000000000..7adca946e0 --- /dev/null +++ b/amdocl/cl_profile_amd.h @@ -0,0 +1,189 @@ +/* Copyright (c) 2009-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef __CL_PROFILE_AMD_H +#define __CL_PROFILE_AMD_H + +#include "CL/cl_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + +typedef struct _cl_perfcounter_amd* cl_perfcounter_amd; +typedef cl_ulong cl_perfcounter_property; +typedef cl_uint cl_perfcounter_info; + +/* cl_perfcounter_info */ +enum PerfcounterInfo { + CL_PERFCOUNTER_NONE = 0x0, + CL_PERFCOUNTER_REFERENCE_COUNT = 0x1, + CL_PERFCOUNTER_DATA = 0x2, + CL_PERFCOUNTER_GPU_BLOCK_INDEX = 0x3, + CL_PERFCOUNTER_GPU_COUNTER_INDEX = 0x4, + CL_PERFCOUNTER_GPU_EVENT_INDEX = 0x5, + CL_PERFCOUNTER_LAST +}; + +/********************************* +* Set device clock mode data +*********************************/ +enum cl_DeviceClockMode_AMD { + CL_DEVICE_CLOCK_MODE_DEFAULT_AMD = 0x0, /*Device clocks and other power settings are restored to default*/ + CL_DEVICE_CLOCK_MODE_QUERY_AMD = 0x1, /*Queries the current device clock ratios. Leaves the clock mode of the device unchanged*/ + CL_DEVICE_CLOCK_MODE_PROFILING_AMD = 0x2, /*Scale down from peak ratio*/ + CL_DEVICE_CLOCK_MODE_MINIMUMMEMORY_AMD = 0x3, /* Memory clock is set to the lowest available level*/ + CL_DEVICE_CLOCK_MODE_MINIMUMENGINE_AMD = 0x4, /*Engine clock is set to the lowest available level*/ + CL_DEVICE_CLOCK_MODE_PEAK_AMD = 0x5, /*Clocks set to maximum when possible. Fan set to maximum.*/ + CL_DEVICE_CLOCK_MODE_QUERYPROFILING_AMD = 0x6, /*Queries the profiling device clock ratios. Leaves the clock mode of the device unchanged*/ + CL_DEVICE_CLOCK_MODE_QUERYPEAK_AMD = 0x7, /*Queries the peak device clock ratios. Leaves the clock mode of the device unchanged*/ + CL_DEVICE_CLOCK_MODE_COUNT_AMD = 0x8, /*Maximum count of device clock modes*/ +}; + +typedef struct _cl_set_device_clock_mode_input_amd +{ + /* specify the clock mode for AMD GPU device*/ + cl_DeviceClockMode_AMD clock_mode; +} cl_set_device_clock_mode_input_amd; + +typedef struct _cl_set_device_clock_mode_output_amd +{ + /*Ratio of current mem clock to peak clock as obtained from DeviceProperties::maxGpuClock*/ + cl_float memory_clock_ratio_to_peak; + /*Ratio of current gpu core clock to peak clock as obtained from DeviceProperties::maxGpuClock*/ + cl_float engine_clock_ratio_to_peak; +} cl_set_device_clock_mode_output_amd; + +/*! \brief Creates a new HW performance counter + * for the specified OpenCL device. + * + * \param device must be a valid OpenCL device. + * + * \param properties the list of properties of the hardware counter + * + * \param errcode_ret A non-zero value if OpenCL failed to create the PerfCounter + * - CL_SUCCESS if the function is executed successfully. + * - CL_INVALID_CONTEXT if the specified context is invalid. + * - CL_OUT_OF_RESOURCES if we couldn't create the object + * + * \return the created perfcounter object + */ +extern CL_API_ENTRY cl_perfcounter_amd CL_API_CALL clCreatePerfCounterAMD( + cl_device_id /* device */, cl_perfcounter_property* /* properties */, cl_int* /* errcode_ret */ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Destroy a performance counter object.
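+ * + * For context, a rough usage sketch of the counter API declared in this header + * (device, properties, queue, ev and value are assumed to be declared elsewhere; + * error handling is omitted): + * \code + * cl_int err; + * cl_event ev; + * cl_ulong value; + * cl_perfcounter_amd pc = clCreatePerfCounterAMD(device, properties, &err); + * clEnqueueBeginPerfCounterAMD(queue, 1, &pc, 0, NULL, NULL); + * // ... enqueue the kernels to be profiled ... + * clEnqueueEndPerfCounterAMD(queue, 1, &pc, 0, NULL, &ev); + * clWaitForEvents(1, &ev); + * clGetPerfCounterInfoAMD(pc, CL_PERFCOUNTER_DATA, sizeof(value), &value, NULL); + * clReleasePerfCounterAMD(pc); + * \endcode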
+ * + * \param perf_counter the perfcounter object for release + * + * \return A non-zero value if OpenCL failed to release the PerfCounter + * - CL_SUCCESS if the function is executed successfully. + * - CL_INVALID_OPERATION if we failed to release the object + */ +extern CL_API_ENTRY cl_int CL_API_CALL clReleasePerfCounterAMD(cl_perfcounter_amd /* perf_counter */ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Increments the perfcounter object reference count. + * + * \param perf_counter the perfcounter object for retain + * + * \return A non-zero value if OpenCL failed to retain the PerfCounter + * - CL_SUCCESS if the function is executed successfully. + * - CL_INVALID_OPERATION if we failed to retain the object + */ +extern CL_API_ENTRY cl_int CL_API_CALL clRetainPerfCounterAMD(cl_perfcounter_amd /* perf_counter */ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Enqueues the begin command for the specified counters. + * + * \param command_queue must be a valid OpenCL command queue. + * + * \param num_perf_counters the number of perfcounter objects in the array. + * + * \param perf_counters specifies an array of perfcounter objects. + * + * \return A non-zero value if OpenCL failed to enqueue the begin operation + * - CL_SUCCESS if the function is executed successfully. + * - CL_INVALID_OPERATION if we failed to enqueue the begin operation + */ +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueBeginPerfCounterAMD( + cl_command_queue /* command_queue */, cl_uint /* num_perf_counters */, + cl_perfcounter_amd* /* perf_counters */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Enqueues the end command for the specified counters. + * + * \param command_queue must be a valid OpenCL command queue. + * + * \param num_perf_counters the number of perfcounter objects in the array. + * + * \param perf_counters specifies an array of perfcounter objects. + * + * \param event the event object associated with the end operation. + * + * \return A non-zero value if OpenCL failed to enqueue the end operation + * - CL_SUCCESS if the function is executed successfully. + * - CL_INVALID_OPERATION if we failed to enqueue the end operation + */ +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueEndPerfCounterAMD( + cl_command_queue /* command_queue */, cl_uint /* num_perf_counters */, + cl_perfcounter_amd* /* perf_counters */, cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, cl_event* /* event */ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Retrieves the results from the counter objects. + * + * \param perf_counter specifies a perfcounter object to query. + * + * \param param_name specifies the information to query. + * + * \param param_value is a pointer to memory where the appropriate result + * being queried is returned. If \a param_value is NULL, it is ignored. + * + * \param param_value_size is used to specify the size in bytes of memory + * pointed to by \a param_value. This size must be >= size of return type. + * + * \param param_value_size_ret returns the actual size in bytes of data copied + * to \a param_value. If \a param_value_size_ret is NULL, it is ignored. + * + * \return + * - CL_SUCCESS if the function is executed successfully. + * - CL_PROFILING_INFO_NOT_AVAILABLE if the event isn't finished.
+ * - CL_INVALID_OPERATION if we failed to get the data + */ +extern CL_API_ENTRY cl_int CL_API_CALL clGetPerfCounterInfoAMD( + cl_perfcounter_amd /* perf_counter */, cl_perfcounter_info /* param_name */, + size_t /* param_value_size */, void* /* param_value */, size_t* /* param_value_size_ret */ + ) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL clSetDeviceClockModeAMD( + cl_device_id /* device*/, cl_set_device_clock_mode_input_amd /* Clock_Mode_Input */, + cl_set_device_clock_mode_output_amd* /* Clock_Mode_Output */ + ) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} /*extern "C"*/ +#endif /*__cplusplus*/ + +#endif /*__CL_PROFILE_AMD_H*/ diff --git a/amdocl/cl_thread_trace_amd.h b/amdocl/cl_thread_trace_amd.h new file mode 100644 index 0000000000..fe9aed6f34 --- /dev/null +++ b/amdocl/cl_thread_trace_amd.h @@ -0,0 +1,363 @@ +/* Copyright (c) 2012-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef __CL_THREAD_TRACE_AMD_H +#define __CL_THREAD_TRACE_AMD_H + +#include "CL/cl_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + +typedef struct _cl_threadtrace_amd* cl_threadtrace_amd; +typedef cl_uint cl_thread_trace_param; +typedef cl_uint cl_threadtrace_info; + +/* cl_command_type */ +#define CL_COMMAND_THREAD_TRACE_MEM 0x4500 +#define CL_COMMAND_THREAD_TRACE 0x4501 + +/* cl_threadtrace_command_name_amd enumeration */ +typedef enum _cl_threadtrace_command_name_amd { + CL_THREAD_TRACE_BEGIN_COMMAND, + CL_THREAD_TRACE_END_COMMAND, + CL_THREAD_TRACE_PAUSE_COMMAND, + CL_THREAD_TRACE_RESUME_COMMAND +} cl_threadtrace_command_name_amd; + +// Thread trace parameters +enum ThreadTraceParameter { + CL_THREAD_TRACE_PARAM_TOKEN_MASK, + CL_THREAD_TRACE_PARAM_REG_MASK, + CL_THREAD_TRACE_PARAM_COMPUTE_UNIT_TARGET, + CL_THREAD_TRACE_PARAM_SHADER_ARRAY_TARGET, + CL_THREAD_TRACE_PARAM_SIMD_MASK, + CL_THREAD_TRACE_PARAM_VM_ID_MASK, + CL_THREAD_TRACE_PARAM_RANDOM_SEED, + CL_THREAD_TRACE_PARAM_CAPTURE_MODE, + CL_THREAD_TRACE_PARAM_INSTRUCTION_MASK, + CL_THREAD_TRACE_PARAM_USER_DATA, + CL_THREAD_TRACE_PARAM_IS_WRAPPED +}; + +// CL_THREAD_TRACE_PARAM_TOKEN_MASK data selects for SI +enum CL_THREAD_TRACE_TOKEN_MASK { + // Time passed + CL_THREAD_TRACE_TOKEN_MASK_TIME_SI = 0x00000001, + // Resync the timestamp + CL_THREAD_TRACE_TOKEN_MASK_TIMESTAMP_SI = 0x00000002, + // A register write has occurred + CL_THREAD_TRACE_TOKEN_MASK_REG_SI = 0x00000004, + // A wavefront has started + CL_THREAD_TRACE_TOKEN_MASK_WAVE_START_SI = 0x00000008, + // Output space has been allocated for color/Z [Should be used for cl-gl] + CL_THREAD_TRACE_TOKEN_MASK_WAVE_PS_ALLOC_SI = 0x00000010, + // Output space has been allocated for vertex position [Should be used for cl-gl] + CL_THREAD_TRACE_TOKEN_MASK_WAVE_VS_ALLOC_SI = 0x00000020, + // Wavefront completion + CL_THREAD_TRACE_TOKEN_MASK_WAVE_END_SI = 0x00000040, + // An event has reached the top of a shader stage. In-order with WAVE_START + CL_THREAD_TRACE_TOKEN_MASK_EVENT_SI = 0x00000080, + // An event has reached the top of a compute shader stage. In-order with WAVE_START + CL_THREAD_TRACE_TOKEN_MASK_EVENT_CS_SI = 0x00000100, + // An event has reached the top of a shader stage for the second GFX pipe. In-order with + // WAVE_START. 
+ // [Should be used for cl-gl] + CL_THREAD_TRACE_TOKEN_MASK_EVENT_GFX_SI = 0x00000200, + // The kernel has executed an instruction + CL_THREAD_TRACE_TOKEN_MASK_INST_SI = 0x00000400, + // The kernel has explicitly written the PC value + CL_THREAD_TRACE_TOKEN_MASK_INST_PC_SI = 0x00000800, + // The kernel has written user data into the thread trace buffer + CL_THREAD_TRACE_TOKEN_MASK_INST_USERDATA_SI = 0x00001000, + // Provides information about instruction scheduling + CL_THREAD_TRACE_TOKEN_MASK_ISSUE_SI = 0x00002000, + // The performance counter delta has been updated + CL_THREAD_TRACE_TOKEN_MASK_PERF_SI = 0x00004000, + // A miscellaneous event has been sent + CL_THREAD_TRACE_TOKEN_MASK_MISC_SI = 0x00008000, + // All possible tokens + CL_THREAD_TRACE_TOKEN_MASK_ALL_SI = 0x0000ffff, +}; + +// CL_THREAD_TRACE_PARAM_REG_MASK data selects +enum CL_THREAD_TRACE_REG_MASK { + // Event initiator + CL_THREAD_TRACE_REG_MASK_EVENT_SI = 0x00000001, + // Draw initiator [Should be used for cl-gl] + CL_THREAD_TRACE_REG_MASK_DRAW_SI = 0x00000002, + // Dispatch initiator + CL_THREAD_TRACE_REG_MASK_DISPATCH_SI = 0x00000004, + // User data from host + CL_THREAD_TRACE_REG_MASK_USERDATA_SI = 0x00000008, + // GFXDEC register (8-state) [Should be used for cl-gl] + CL_THREAD_TRACE_REG_MASK_GFXDEC_SI = 0x00000020, + // SHDEC register (many state) + CL_THREAD_TRACE_REG_MASK_SHDEC_SI = 0x00000040, + // Other registers + CL_THREAD_TRACE_REG_MASK_OTHER_SI = 0x00000080, + // All possible register types + CL_THREAD_TRACE_REG_MASK_ALL_SI = 0x000000ff, +}; + +// CL_THREAD_TRACE_PARAM_VM_ID_MASK data selects +enum CL_THREAD_TRACE_VM_ID_MASK { + // Capture only data from the VM_ID used to write {SQTT}_BASE + CL_THREAD_TRACE_VM_ID_MASK_SINGLE = 0, + // Capture all data from all VM_IDs + CL_THREAD_TRACE_VM_ID_MASK_ALL = 1, + // Capture all data but only get target (a.k.a. detail) data from VM_ID used to write {SQTT}_BASE + CL_THREAD_TRACE_VM_ID_MASK_SINGLE_DETAIL = 2 +}; + +// CL_THREAD_TRACE_PARAM_CAPTURE_MODE data +enum CL_THREAD_TRACE_CAPTURE_MODE { + // Capture all data in the thread trace buffer + CL_THREAD_TRACE_CAPTURE_ALL = 0, + // Capture only data between THREAD_TRACE_START and THREAD_TRACE_STOP events + CL_THREAD_TRACE_CAPTURE_SELECT = 1, + // Capture data between THREAD_TRACE_START and THREAD_TRACE_STOP events, + // and global/reference data at all times + CL_THREAD_TRACE_CAPTURE_SELECT_DETAIL = 2 +}; + +// CL_THREAD_TRACE_PARAM_INSTRUCTION_MASK data selects +enum CL_THREAD_TRACE_INSTRUCTION_MASK { + // Generate {SQTT}_TOKEN_INST tokens for all instructions + CL_THREAD_TRACE_INST_MASK_ALL, + // Generate {SQTT}_TOKEN_INST tokens for stalled instructions only + CL_THREAD_TRACE_INST_MASK_STALLED, + // Generate {SQTT}_TOKEN_INST messages for stalled and other (no op/wait/set prio/etc) + // instructions + CL_THREAD_TRACE_INST_MASK_STALLED_AND_IMMEDIATE, + // Generate {SQTT}_TOKEN_INST messages for immediate instructions only [Should be used only + // for CI] + CL_THREAD_TRACE_INST_MASK_IMMEDIATE_CI, +}; + +enum ThreadTraceInfo { + CL_THREAD_TRACE_SE, + CL_THREAD_TRACE_BUFFERS_FILLED, + CL_THREAD_TRACE_BUFFERS_SIZE +}; + + +/*! \brief Creates a new cl_threadtrace_amd object + * + * \param device must be a valid OpenCL device. + * + * \param errcode_ret A non-zero value if OpenCL failed to create threadTrace + * -CL_INVALID_DEVICE if device is not a valid device. + * -CL_DEVICE_NOT_AVAILABLE if a device is currently not available even + * though the device was returned by clGetDeviceIDs.
+ * -CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the + * OpenCL implementation on the device. + * -CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the + * OpenCL implementation on the host. + * + * \return the created threadTrace object + */ +extern CL_API_ENTRY cl_threadtrace_amd CL_API_CALL clCreateThreadTraceAMD( + cl_device_id /* device */, cl_int* /* errcode_ret */ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Destroys a cl_threadtrace_amd object. + * + * \param threadTrace the cl_threadtrace_amd object for release + * + * \return A non-zero value if OpenCL failed to release threadTrace + * -CL_INVALID_VALUE if the thread_trace is not a valid OpenCL thread trace object + * (cl_threadtrace_amd). + * -CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the + * OpenCL implementation on the device. + * -CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the + * OpenCL implementation on the host. + */ +extern CL_API_ENTRY cl_int CL_API_CALL clReleaseThreadTraceAMD(cl_threadtrace_amd /* threadTrace */ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Increments the cl_threadtrace_amd object reference count. + * + * \param threadTrace the cl_threadtrace_amd object for retain + * + * \return A non-zero value if OpenCL failed to retain threadTrace + * -CL_INVALID_VALUE if the thread_trace is not a valid thread trace object (cl_threadtrace_amd). + * -CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the + * OpenCL implementation on the device. + * -CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the + * OpenCL implementation on the host. + */ +extern CL_API_ENTRY cl_int CL_API_CALL clRetainThreadTraceAMD(cl_threadtrace_amd /* threadTrace */ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Sets the cl_threadtrace_amd object configuration parameter. + * + * \param thread_trace the cl_threadtrace_amd object to set configuration parameter + * + * \param config_param the cl_thread_trace_param to set + * + * \param param_value the value corresponding to config_param + * + * \return A non-zero value if OpenCL failed to set the threadTrace parameter + * - CL_INVALID_VALUE if the thread_trace is not a valid thread trace object. + * - CL_INVALID_VALUE if an invalid config_param or param_value enum value is used. + * - CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL + * implementation on the device. + * - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the + * OpenCL implementation on the host. + */ + +extern CL_API_ENTRY cl_int CL_API_CALL clSetThreadTraceParamAMD( + cl_threadtrace_amd /*thread_trace*/, cl_thread_trace_param /*config_param*/, + cl_uint /*param_value*/ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Enqueues the binding command to bind a cl_threadtrace_amd object to cl_mem objects for + * trace recording. + * + * \param command_queue must be a valid OpenCL command queue. + * + * \param thread_trace specifies the cl_threadtrace_amd object.
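+ * + * (Typically one cl_mem object is bound per shader engine; see the CL_INVALID_OPERATION + * case below.)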
+ * + * \param mem_objects the cl_mem objects for trace recording + * + * \param mem_objects_num the number of cl_mem objects in the mem_objects array + * + * \param buffer_size the size of each cl_mem object in mem_objects + * + * \param event_wait_list is a pointer to events that need to + * complete before this particular command can be executed. + * If \a event_wait_list is NULL, then this particular command does not wait + * on any event to complete. If \a event_wait_list is NULL, + * \a num_events_in_wait_list must be 0. If \a event_wait_list is not NULL, + * the list of events pointed to by \a event_wait_list must be valid and + * \a num_events_in_wait_list must be greater than 0. The events specified in + * \a event_wait_list act as synchronization points. + * + * \param num_events_in_wait_list specifies the number of events in + * \a event_wait_list. It must be 0 if \a event_wait_list is NULL. It must be + * greater than 0 if \a event_wait_list is not NULL. + * + * \param event returns an event object that identifies this particular + * command and can be used to query or queue a wait for this particular + * command to complete. \a event can be NULL in which case it will not be + * possible for the application to query the status of this command or queue a + * wait for this command to complete. + * \return A non-zero value if OpenCL failed to bind the thread trace buffers + * - CL_INVALID_COMMAND_QUEUE if command_queue is not a valid command-queue. + * - CL_INVALID_CONTEXT if the context associated with command_queue and events in event_wait_list + * are not the same. + * - CL_INVALID_VALUE if the thread_trace is not a valid thread trace object. + * - CL_INVALID_VALUE if the buffer_size is zero. + * - CL_INVALID_VALUE if mem_objects_num is less than 1. + * - CL_INVALID_OPERATION if the mem_objects_num is not equal to the number of Shader Engines of + * the GPU device. + * - CL_INVALID_MEM_OBJECT if one of the memory objects in the mem_objects array is not a valid + * memory object or mem_objects is NULL. + * - CL_MEM_OBJECT_ALLOCATION_FAILURE if there is a failure to allocate memory for the data store + * associated with the memory objects of the mem_objects array. + * - CL_INVALID_EVENT_WAIT_LIST if event_wait_list is NULL and num_events_in_wait_list > 0, or + * event_wait_list is not NULL and num_events_in_wait_list is 0, or if event objects in + * event_wait_list are not valid events. + * - CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL + * implementation on the device. + * - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the + * OpenCL implementation on the host. + */ +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueBindThreadTraceBufferAMD( + cl_command_queue /*command_queue*/, cl_threadtrace_amd /*thread_trace*/, cl_mem* /*mem_objects*/, + cl_uint /*mem_objects_num*/, cl_uint /*buffer_size*/, cl_uint /*num_events_in_wait_list*/, + const cl_event* /*event_wait_list*/, cl_event* /*event*/ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Get specific information about the OpenCL Thread Trace. + * + * \param thread_trace_info_param is an enum that identifies the Thread Trace information being + * queried. + * + * \param param_value is a pointer to a memory location where appropriate values + * for a given \a thread_trace_info_param will be returned. If \a param_value is NULL, + * it is ignored. + * + * \param param_value_size specifies the size in bytes of memory pointed to by + * \a param_value.
This size in bytes must be >= size of return type. + * + * \param param_value_size_ret returns the actual size in bytes of data copied + * to \a param_value. If \a param_value_size_ret is NULL, it is ignored. + * + * \return One of the following values: + * - CL_INVALID_OPERATION if the cl_threadtrace_amd object is not valid + * - CL_INVALID_VALUE if \a param_name is not one of the supported + * values or if size in bytes specified by \a param_value_size is < size of + * return type and \a param_value is not a NULL value. + * - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the + * OpenCL implementation on the host. + * - CL_SUCCESS if the function is executed successfully. + */ +extern CL_API_ENTRY cl_int CL_API_CALL clGetThreadTraceInfoAMD( + cl_threadtrace_amd /* thread_trace */, cl_threadtrace_info /*thread_trace_info_param*/, + size_t /*param_value_size*/, void* /*param_value*/, size_t* /*param_value_size_ret*/ + ) CL_API_SUFFIX__VERSION_1_0; + +/*! \brief Enqueues the thread trace command for the specified thread trace object. + * + * \param command_queue must be a valid OpenCL command queue. + * + * \param thread_trace specifies the cl_threadtrace_amd object. + * + * \return A non-zero value if OpenCL failed to enqueue the thread trace command + * - CL_INVALID_COMMAND_QUEUE if command_queue is not a valid command-queue. + * - CL_INVALID_CONTEXT if the context associated with command_queue and events in event_wait_list + * are not the same. + * - CL_INVALID_VALUE if the thread_trace is not a valid thread trace object. + * - CL_INVALID_VALUE if an invalid command name enum value, not described in + * cl_threadtrace_command_name_amd, is used. + * - CL_INVALID_OPERATION if the command enqueue failed. It can happen in the following cases: + * o BEGIN_COMMAND is queued for thread trace object for which memory object/s was/were not + * bound. + * o END_COMMAND is queued for thread trace object, for which BEGIN_COMMAND was not queued. + * o PAUSE_COMMAND is queued for thread trace object, for which BEGIN_COMMAND was not + * queued. + * o RESUME_COMMAND is queued for thread trace object, for which PAUSE_COMMAND was not + * queued. + * - CL_INVALID_EVENT_WAIT_LIST if event_wait_list is NULL and num_events_in_wait_list > 0, or + * event_wait_list is not NULL and num_events_in_wait_list is 0, or if event objects in + * event_wait_list are not valid events. + * - CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL + * implementation on the device. + * - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL + * implementation on the host. + */ +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueThreadTraceCommandAMD( + cl_command_queue /*command_queue*/, cl_threadtrace_amd /*thread_trace*/, + cl_threadtrace_command_name_amd /*command_name*/, cl_uint /*num_events_in_wait_list*/, + const cl_event* /*event_wait_list*/, cl_event* /*event*/ + ) CL_API_SUFFIX__VERSION_1_0; + + +#ifdef __cplusplus +} /*extern "C"*/ +#endif /*__cplusplus*/ + +#endif /*__CL_THREAD_TRACE_AMD_H*/ diff --git a/amdocl/gl_functions.hpp b/amdocl/gl_functions.hpp new file mode 100644 index 0000000000..2d184bc2e6 --- /dev/null +++ b/amdocl/gl_functions.hpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +GLPREFIX(GLubyte*, glGetString, (GLenum name)) + +GLPREFIX(void, glBindBuffer, (GLenum target, GLuint buffer)) +//GLPREFIX(void, glBindFramebufferEXT, (GLenum target, GLuint framebuffer)) +GLPREFIX(void, glBindRenderbuffer, (GLenum target, GLuint renderbuffer)) +GLPREFIX(void, glBindTexture, (GLenum target, GLuint texture)) +GLPREFIX(void, glBufferData, (GLenum target, GLsizeiptr size, const GLvoid* data, GLenum usage)) + +GLPREFIX(GLenum, glCheckFramebufferStatusEXT, (GLenum target)) + +GLPREFIX(void, glDeleteBuffers, (GLsizei n, const GLuint* buffers)) +GLPREFIX(void, glDrawPixels, (GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels)) + +//GLPREFIX(void, glFramebufferRenderbufferEXT, (GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer)) + +GLPREFIX(void, glGenBuffers, (GLsizei n, GLuint* buffers)) +//GLPREFIX(void, glGenFramebuffersEXT, (GLsizei n, GLuint* framebuffers)) +//10 +GLPREFIX(void, glGetBufferParameteriv, (GLenum target, GLenum pname, GLint* params)) +GLPREFIX(GLenum, glGetError, (void)) +GLPREFIX(void, glFinish, (void)) +GLPREFIX(void, glFlush, (void)) +GLPREFIX(GLenum, glClientWaitSync, (GLsync sync, GLbitfield flags, GLuint64 timeout)) +GLPREFIX(void, glGetIntegerv, (GLenum pname, GLint *params)) +GLPREFIX(void, glGetRenderbufferParameterivEXT, (GLenum target, GLenum pname, GLint* params)) +//GLPREFIX(GLubyte*, glGetString, (GLenum name)) +GLPREFIX(void, glGetTexImage, (GLenum target, GLint level, GLenum format, GLenum type, GLvoid *pixels)) +GLPREFIX(void, glGetTexLevelParameteriv, (GLenum target, GLint level, GLenum pname, GLint *params)) +GLPREFIX(void, glGetTexParameteriv, (GLenum target, GLenum pname, GLint *params)) + +GLPREFIX(GLboolean, glIsBuffer, (GLuint buffer)) +GLPREFIX(GLboolean, glIsRenderbufferEXT, (GLuint renderbuffer)) +GLPREFIX(GLboolean, glIsTexture, (GLuint texture)) +//20 +GLPREFIX(GLvoid*, glMapBuffer, (GLenum target, GLenum access)) + +GLPREFIX(void, glReadPixels, (GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLvoid *pixels)) + +GLPREFIX(void, glTexImage2D, (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels)) +GLPREFIX(void, glTexImage3D, (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels)) + +GLPREFIX(GLboolean, glUnmapBuffer, 
(GLenum target)) + +#undef GLPREFIX diff --git a/amdocl/icd/loader/icd_dispatch.h b/amdocl/icd/loader/icd_dispatch.h new file mode 100644 index 0000000000..84a3e305a7 --- /dev/null +++ b/amdocl/icd/loader/icd_dispatch.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2016-2019 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * OpenCL is a trademark of Apple Inc. used under license by Khronos. + */ + +#ifndef _ICD_DISPATCH_H_ +#define _ICD_DISPATCH_H_ + +#ifndef CL_USE_DEPRECATED_OPENCL_1_0_APIS +#define CL_USE_DEPRECATED_OPENCL_1_0_APIS +#endif + +#ifndef CL_USE_DEPRECATED_OPENCL_1_1_APIS +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#endif + +#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif + +#ifndef CL_USE_DEPRECATED_OPENCL_2_0_APIS +#define CL_USE_DEPRECATED_OPENCL_2_0_APIS +#endif + +// cl.h +#include + +// cl_gl.h and required files +#ifdef _WIN32 +#include +#include +#include +#include +#include +#include +#endif +#include +#include +#include +#include +#include + +/* + * + * vendor dispatch table structure + * + */ + +struct _cl_platform_id +{ + cl_icd_dispatch *dispatch; +}; + +struct _cl_device_id +{ + cl_icd_dispatch *dispatch; +}; + +struct _cl_context +{ + cl_icd_dispatch *dispatch; +}; + +struct _cl_command_queue +{ + cl_icd_dispatch *dispatch; +}; + +struct _cl_mem +{ + cl_icd_dispatch *dispatch; +}; + +struct _cl_program +{ + cl_icd_dispatch *dispatch; +}; + +struct _cl_kernel +{ + cl_icd_dispatch *dispatch; +}; + +struct _cl_event +{ + cl_icd_dispatch *dispatch; +}; + +struct _cl_sampler +{ + cl_icd_dispatch *dispatch; +}; + +#endif // _ICD_DISPATCH_H_ + diff --git a/bin/hip_embed_pch.sh b/bin/hip_embed_pch.sh index b8ececf36c..6ec5568809 100755 --- a/bin/hip_embed_pch.sh +++ b/bin/hip_embed_pch.sh @@ -21,7 +21,7 @@ printUsage() { echo - echo "Usage: $(basename "$0") HIP_BUILD_INC_DIR HIP_INC_DIR HIP_AMD_INC_DIR LLVM_DIR [option] [RTC_LIB_OUTPUT]" + echo "Usage: $(basename "$0") HIP_BUILD_INC_DIR HIP_INC_DIR LLVM_DIR HSA_DIR [option] [RTC_LIB_OUTPUT]" echo echo "Options:" echo " -p, --generate_pch Generate pre-compiled header (default)" @@ -39,8 +39,8 @@ fi HIP_BUILD_INC_DIR="$1" HIP_INC_DIR="$2" -HIP_AMD_INC_DIR="$3" -LLVM_DIR="$4" +LLVM_DIR="$3" +HSA_DIR="$4" # By default, generate pch TARGET="generatepch" @@ -54,12 +54,12 @@ do -r | --generate_rtc ) TARGET="generatertc" ; break ;; *) - echo " UNEXPECTED ERROR Parm : [$4] ">&2 ; exit 20 ;; + echo " UNEXPECTED ERROR Parm : [$5] ">&2 ; exit 20 ;; esac shift 1 done -# Allow hiprtc lib name to be set by argument 7 +# Allow hiprtc lib name to be set by argument 6 if [[ "$6" != "" ]]; then rtc_shared_lib_out="$6" else @@ -78,9 +78,11 @@ else tmpdir=/tmp fi -# Expected first argument $1 to be output file name. -create_hip_macro_file() { -cat >$1 <$tmp/hip_macros.h <$1 <$tmp/hip_pch.h <$tmp/pch.cui && + $LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. 
-std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HSA_DIR/include --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui && cat $tmp/hip_macros.h >> $tmp/pch.cui && @@ -139,29 +134,17 @@ EOF generate_rtc_header() { tmp=$tmpdir/hip_rtc.$$ mkdir -p $tmp - local macroFile="$tmp/hip_macros.h" local headerFile="$tmp/hipRTC_header.h" local mcinFile="$tmp/hipRTC_header.mcin" - create_hip_macro_file $macroFile - cat >$headerFile < $mcinFile if [[ $isWindows -eq 0 ]]; then echo " .type __hipRTC_header,@object" >> $mcinFile - echo " .type __hipRTC_header_size,@object" >> $mcinFile fi cat >>$mcinFile <> $tmp/hiprtc && + $LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem --cuda-device-only -D__HIPCC_RTC__ -x hip $tmp/hipRTC_header.h -E -o $tmp/hiprtc && $LLVM_DIR/bin/llvm-mc -o $tmp/hiprtc_header.o $tmp/hipRTC_header.mcin --filetype=obj && $LLVM_DIR/bin/clang $tmp/hiprtc_header.o -o $rtc_shared_lib_out -shared && - $LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib -nogpuinc -emit-llvm -c -o $tmp/tmp.bc --cuda-device-only -D__HIPCC_RTC__ --offload-arch=gfx906 -x hip-cpp-output $tmp/hiprtc && rm -rf $tmp } diff --git a/bin/hipcc b/bin/hipcc index 8915ce2289..6cea1e36cf 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -296,7 +296,7 @@ foreach $arg (@ARGV) if ($skipOutputFile) { # TODO: handle filename with shell metacharacters - $toolArgs .= " \"$arg\""; + $toolArgs .= " $arg"; $prevArg = $arg; $skipOutputFile = 0; next; @@ -376,15 +376,6 @@ foreach $arg (@ARGV) $swallowArg = 1; } - # nvcc does not handle standard compiler options properly - # This can prevent hipcc being used as standard CXX/C Compiler - # To fix this we need to pass -Xcompiler for options - if (($arg eq '-fPIC' or $arg =~ '-Wl,') and $HIP_COMPILER eq 'nvcc') - { - $HIPCXXFLAGS .= " -Xcompiler ".$arg; - $swallowArg = 1; - } - ## process linker response file for hip-clang ## extract object files from static library and pass them directly to ## hip-clang in command line. @@ -428,7 +419,7 @@ foreach $arg (@ARGV) } elsif ($realObjs) { my($libBaseName, $libDir, $libExt) = fileparse($libFile); $libBaseName = mktemp($libBaseName . "XXXX") . $libExt; - system("cd $tmpdir; ar rc $libBaseName $realObjs"); + system("cd $tmpdir; ar c $libBaseName $realObjs"); print $out "$tmpdir/$libBaseName\n"; } } elsif ($line =~ m/\.o$/) { @@ -490,7 +481,7 @@ foreach $arg (@ARGV) } elsif ($realObjs) { my($libBaseName, $libDir, $libExt) = fileparse($libFile); $libBaseName = mktemp($libBaseName . "XXXX") . $libExt; - system("cd $tmpdir; ar rc $libBaseName $realObjs"); + system("cd $tmpdir; ar c $libBaseName $realObjs"); $new_arg .= " $tmpdir/$libBaseName"; } $arg = "$new_arg"; @@ -552,7 +543,7 @@ foreach $arg (@ARGV) $hasC = 1; $needCFLAGS = 1; $toolArgs .= " -x c"; - } elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) or ($arg =~ /\.C$/)) { + } elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) ) { $needCXXFLAGS = 1; if ($HIP_COMPILE_CXX_AS_HIP eq '0' or $HIP_PLATFORM ne "amd") { $hasCXX = 1; @@ -702,9 +693,9 @@ if ($HIP_PLATFORM eq "amd") { if (not $isWindows and not $compileOnly) { if ($linkType eq 0) { - $toolArgs = " -L$HIP_LIB_PATH -lamdhip64 -L$ROCM_PATH/lib -lhsa-runtime64 -ldl -lnuma " . 
${toolArgs}; + $toolArgs .= " -L$HIP_LIB_PATH -lamdhip64 -L$ROCM_PATH/lib -lhsa-runtime64 -ldl -lnuma "; } else { - $toolArgs = " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lamdhip64 " . ${toolArgs}; + $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lamdhip64 "; } # To support __fp16 and _Float16, explicitly link with compiler-rt $toolArgs .= " -L$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/lib/linux -lclang_rt.builtins-x86_64 " diff --git a/bin/hipcc.bat b/bin/hipcc.bat deleted file mode 100644 index 104e78622d..0000000000 --- a/bin/hipcc.bat +++ /dev/null @@ -1,2 +0,0 @@ -@IF DEFINED HIP_PATH (set HIPCC="%HIP_PATH%/bin/hipcc") ELSE (set HIPCC="%CD%/hipcc") -@perl %HIPCC% %* diff --git a/bin/hipconfig.bat b/bin/hipconfig.bat deleted file mode 100644 index 64db66aece..0000000000 --- a/bin/hipconfig.bat +++ /dev/null @@ -1,2 +0,0 @@ -@IF DEFINED HIP_PATH (set HIPCONFIG="%HIP_PATH%/bin/hipconfig") ELSE (set HIPCONFIG="%CD%/hipconfig") -@perl %HIPCONFIG% %* diff --git a/bin/roc-obj b/bin/roc-obj deleted file mode 100755 index 8f93b611a9..0000000000 --- a/bin/roc-obj +++ /dev/null @@ -1,264 +0,0 @@ -#!/bin/bash - -#| Usage: roc-obj [-h] [-t REGEXP] [-o OUTDIR] [-I REPLACE-STRING|-i] [-d] -#| EXECUTABLE... [: [SUFFIX COMMAND [ARGS...] ;]...] -#| -#| Wrapper for roc-obj-ls and roc-obj-extract which extracts code objects -#| embedded in each EXECUTABLE and optionally applies COMMANDs to them. -#| -#| If the POSIX extended regular expression REGEXP is specified, only embedded -#| code objects whose Target ID matches REGEXP are extracted; otherwise all -#| code objects are extracted. -#| -#| If the directory path OUTDIR is specified, it is created if it does not -#| already exist, and the code objects are extracted into it; otherwise they -#| are extracted into the current working directory. -#| -#| The extracted files are named by appending a ":" followed by the Target ID -#| of the extracted code object to the input filename EXECUTABLE they were -#| extracted from. -#| -#| If the list of EXECUTABLE arguments is terminated with ":" then after all -#| selected files are successfully extracted, zero or more additional embedded -#| command-lines, separated by ";", are read from the command-line starting -#| after the ":". These must specify a SUFFIX used to name the output of the -#| corresponding COMMAND, along with the COMMAND name and any ARGS to it. -#| -#| Then each COMMAND is executed, as if by a POSIX "execvp" function, once for -#| each embedded code object that was created in OUTDIR. (Note: Typically this -#| means the user must ensure the commands are present in at least one -#| directory of the "PATH" environment variable.) For each execution of -#| COMMAND: -#| -#| If REPLACE-STRING is specified, all instances of REPLACE-STRING in ARGS are -#| replaced with the file path of the extracted code object before executing -#| COMMAND. -#| -#| The standard input is redirected from the extracted code object. -#| -#| If SUFFIX is "-" the standard output is not redirected. If SUFFIX is "!" the -#| standard output is redirected to /dev/null. Otherwise, the standard output -#| is redirected to files named by the file path of the extracted code object -#| with SUFFIX appended. -#| -#| Note: The executables roc-obj-ls, roc-obj-extract, and llvm-objdump (in the -#| case of disassembly requested using the -d flag) are searched for in a -#| unique way. 
A series of directories are searched, some conditionally, until -#| a suitable executable is found. If all directories are searched without -#| finding the executable, an error occurs. The first directory searched is the -#| one containing the hard-link to the roc-obj being executed, known as the -#| "base directory". Next, if the environment variable HIP_CLANG_PATH is set, -#| it is searched; otherwise, the base directory path is appended with -#| "../../llvm/bin" and it is searched. Finally, the PATH is searched as if by -#| a POSIX "execvp" function. -#| -#| Option Descriptions: -#| -h, --help print this help text and exit -#| -t, --target-id only extract code objects from EXECUTABLE whose Target ID -#| matches the POSIX extended regular expression REGEXP -#| -o, --outdir set the output directory, which is created if it -#| does not exist -#| -I, --replace-string replace all occurrences of the literal string -#| REPLACE-STRING in ARGS with the input filename -#| -i, --replace equivalent to -I{} -#| -d, --disassemble diassemble extracted code objects; equivalent to -#| : .s llvm-objdump -d - ; -#| -#| Example Usage: -#| -#| Extract all code objects embedded in a.so: -#| $ roc-obj a.so -#| -#| Extract all code objects embedded in a.so, b.so, and c.so: -#| $ roc-obj a.so b.so c.so -#| -#| Extract all code objects embedded in a.so with "gfx9" in their Target ID: -#| $ roc-obj -t gfx9 a.so -#| -#| Extract all code objects embedded in a.so into output/ (creating it if needed): -#| $ roc-obj -o output/ a.so -#| -#| Extract all code objects embedded in a.so with "gfx9" in their Target ID -#| into output/ (creating it if needed): -#| $ roc-obj -t gfx9 -o output/ a.so -#| -#| Extract all code objects embedded in a.so, and then disassemble each of them -#| to files ending with .s: -#| $ roc-obj -d a.so -#| -#| Extract all code objects embedded in a.so, and count the number of bytes in -#| each, writing the results to files ending with .count: -#| $ roc-obj a.so : .count wc -c -#| -#| Extract all code objects embedded in a.so, and inspect their ELF headers -#| using llvm-readelf (which will not read from standard input), writing to -#| files ending with .hdr: -#| $ roc-obj -I'{}' a.so : .hdr llvm-readelf -h '{}' -#| -#| Extract all code objects embedded in a.so, and then extract each of their -#| .text sections using llvm-objcopy (which won't read from standard input -#| or write to standard output): -#| $ roc-obj -I'{}' a.so : ! llvm-objcopy -O binary :only-section=.text '{}' '{}.text' -#| -#| Extract all code objects embedded in a.so, b.so, and c.so with target -#| feature xnack disabled into directory out/. Then, for each: -#| Write the size in bytes into a file ending with .count, and -#| Write a textual description of the ELF headers to a file ending with .hdr, and -#| Extract the .text section to a file ending with .text -#| $ roc-obj -I'{}' -t xnack- -o out/ a.so b.so c.so : \ -#| .count wc -c \; -#| .hdr llvm-readelf -h '{}' \; -#| ! llvm-objcopy -O binary --only-section=.text '{}' '{}.text' - -set -euo pipefail - -usage() { - sed -n 's/^#| \?\(.*\)$/\1/p' "$0" -} - -usage_then_exit() { - local -r status="$1"; shift - usage >&$(( status ? 2 : 1 )) - exit "$status" -} - -fail() { - printf "error: %s\n" "$*" >&2 - exit 1 -} - -# Account for the fact that we do not necessarily put ROCm tools in the PATH, -# nor do we have a single, unified ROCm "bin/" directory. 
-# -# Note that this is only used for roc-obj-ls, roc-obj-extract, and "shortcut" -# options like -d, and the user can still use any copy of llvm-* by explicitly -# invoking it with a full path, e.g. : /path/to/llvm-* ... ; -find_rocm_executable_or_fail() { - local -r command="$1"; shift - local file - local searched=() - for dir in "$BASE_DIR" "${HIP_CLANG_PATH:-"$BASE_DIR/../../llvm/bin"}"; do - file="$dir/$command" - if [[ -x $file ]]; then - printf "%s" "$file" - return - else - searched+=("$dir") - fi - done - if hash "$command" 2>/dev/null; then - printf "%s" "$command" - else - fail could not find "$command" in "${searched[*]}" or PATH - fi -} - -# Extract the embedded code objects of the executable file given as the first -# argument into OPT_OUTDIR, filtering them via OPT_TARGET_ID. -# -# Deletes any resulting files which are empty, and prints the paths of the -# remaining files. -extract() { - local -r executable="$1"; shift - local prefix - prefix="$(basename -- "$executable")" - # We want the shell to split the result of roc-obj-ls on whitespace, as - # neither the Target ID nor the URI can have embedded spaces. - # shellcheck disable=SC2046 - set -- $(roc-obj-ls -v -- "$executable" | awk "NR>2 && \$1~/$OPT_TARGET_ID/") - while (( $# )); do - local output="$prefix:$1"; shift - local uri="$1"; shift - [[ -n $OPT_OUTDIR ]] && output="$OPT_OUTDIR/$output" - roc-obj-extract -o - -- "$uri" >"$output" - if [[ -s $output ]]; then - printf '%s\n' "$output" - else - rm "$output" - fi - done - (( $# )) && fail expected even number of fields from roc-obj-ls -} - -# Run a command over a list of inputs, naming output files with the supplied -# suffix and applying OPT_REPLACE_STRING if needed. -# -# Arguments are of the form: -# $suffix $command $args... ; $inputs -run_command() { - local -r suffix="$1"; shift - local -r command="$1"; shift - local args=() - while (( $# )); do - local arg="$1"; shift - [[ $arg == ';' ]] && break - args+=("$arg") - done - local inputs=("$@") - for input in "${inputs[@]}"; do - case "$suffix" in - '-') output=/dev/stdout;; - '!') output=/dev/null;; - *) output="$input$suffix";; - esac - "$command" "${args[@]//$OPT_REPLACE_STRING/$input}" <"$input" >"$output" - done -} - -main() { - [[ -n $OPT_OUTDIR ]] && mkdir -p "$OPT_OUTDIR" - local inputs=() - while (( $# )); do - local executable="$1"; shift - [[ $executable == : ]] && break - # Append the file paths extracted from $executable to $inputs - readarray -t -O "${#inputs[@]}" inputs < <(extract "$executable") - done - (( ${#inputs[@]} )) || fail no executables specified - while (( $# )); do - local suffix="$1"; shift - local command="$1"; shift - local args=() - while (( $# )); do - local arg="$1"; shift - [[ $arg == \; ]] && break - args+=("$arg") - done - run_command "$suffix" "$command" "${args[@]}" \; "${inputs[@]}" - done - (( OPT_DISASSEMBLE )) && run_command .s "$OBJDUMP" -d - \; "${inputs[@]}" -} - -OPT_TARGET_ID='' -OPT_OUTDIR='' -OPT_REPLACE_STRING='' -OPT_DISASSEMBLE=0 -! 
getopt -T || fail util-linux enhanced getopt required -getopt="$(getopt -o +hg:o:I:id \ - --long help,target-id:,outdir:,replace:,replace-default,disassemble \ - -n roc-obj -- "$@")" -eval set -- "$getopt" -unset getopt -while true; do - case "$1" in - -h | --help) usage_then_exit 0;; - -t | --target-id) OPT_TARGET_ID="${2//\//\\\/}"; shift 2;; - -o | --outdir) OPT_OUTDIR="$2"; shift 2;; - -I | --replace-string) OPT_REPLACE_STRING="$2"; shift 2;; - -i | --replace) OPT_REPLACE_STRING='{}'; shift;; - -d | --disassemble) OPT_DISASSEMBLE=1; shift;; - --) shift; break;; - *) usage_then_exit 1;; - esac -done -readonly -- OPT_TARGET_ID OPT_OUTDIR OPT_REPLACE_STRING OPT_DISASSEMBLE - -# We expect to be installed as ROCM_PATH/hip/bin/roc-obj, which means BASE_DIR -# is ROCM_PATH/hip/bin. -readonly BASE_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)" -(( OPT_DISASSEMBLE )) \ - && readonly OBJDUMP="$(find_rocm_executable_or_fail llvm-objdump)" -readonly ROC_OBJ_LS="$(find_rocm_executable_or_fail roc-obj-ls)" -readonly ROC_OBJ_EXTRACT="$(find_rocm_executable_or_fail roc-obj-extract)" - -main "$@" diff --git a/cmake/FindROCR.cmake b/cmake/FindROCR.cmake new file mode 100644 index 0000000000..d2c45e9596 --- /dev/null +++ b/cmake/FindROCR.cmake @@ -0,0 +1,35 @@ +# Copyright (C) 2020-2021 Advanced Micro Devices, Inc. All Rights Reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +# Try to find ROCR (Radeon Open Compute Runtime) +# +# Once found, this will define: +# - ROCR_FOUND - ROCR status (found or not found) +# - ROCR_INCLUDES - Required ROCR include directories +# - ROCR_LIBRARIES - Required ROCR libraries +find_path(FIND_ROCR_INCLUDES hsa.h HINTS /opt/rocm/include /opt/rocm/hsa/include PATH_SUFFIXES hsa) +find_library(FIND_ROCR_LIBRARIES hsa-runtime64 HINTS /opt/rocm/lib /opt/rocm/hsa/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(ROCR DEFAULT_MSG + FIND_ROCR_INCLUDES FIND_ROCR_LIBRARIES) +mark_as_advanced(FIND_ROCR_INCLUDES FIND_ROCR_LIBRARIES) + +set(ROCR_INCLUDES ${FIND_ROCR_INCLUDES}) +set(ROCR_LIBRARIES ${FIND_ROCR_LIBRARIES}) diff --git a/cmake/FindROCT.cmake b/cmake/FindROCT.cmake new file mode 100644 index 0000000000..9d55f3f40a --- /dev/null +++ b/cmake/FindROCT.cmake @@ -0,0 +1,35 @@ +# Copyright (C) 2020-2021 Advanced Micro Devices, Inc. All Rights Reserved. 
+# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +# Try to find ROCT (Radeon Open Compute Thunk) +# +# Once found, this will define: +# - ROCT_FOUND - ROCT status (found or not found) +# - ROCT_INCLUDES - Required ROCT include directories +# - ROCT_LIBRARIES - Required ROCT libraries +find_path(FIND_ROCT_INCLUDES hsakmt.h HINTS /opt/rocm/include) +find_library(FIND_ROCT_LIBRARIES hsakmt HINTS /opt/rocm/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(ROCT DEFAULT_MSG + FIND_ROCT_INCLUDES FIND_ROCT_LIBRARIES) +mark_as_advanced(FIND_ROCT_INCLUDES FIND_ROCT_LIBRARIES) + +set(ROCT_INCLUDES ${FIND_ROCT_INCLUDES}) +set(ROCT_LIBRARIES ${FIND_ROCT_LIBRARIES}) diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md index ee1b379d2c..c4f11a27ff 100644 --- a/docs/markdown/hip_faq.md +++ b/docs/markdown/hip_faq.md @@ -31,7 +31,6 @@ - [How to create a guard for code that is specific to the host or the GPU?](#how-to-create-a-guard-for-code-that-is-specific-to-the-host-or-the-gpu) - [Why _OpenMP is undefined when compiling with -fopenmp?](#why-_openmp-is-undefined-when-compiling-with--fopenmp) - [Does the HIP-Clang compiler support extern shared declarations?](#does-the-hip-clang-compiler-support-extern-shared-declarations) -- [I have multiple HIP enabled devices and I am getting an error message hipErrorNoBinaryForGpu: Unable to find code object for all current devices?](#i-have-multiple-hip-enabled-devices-and-i-am-getting-an-error-message-hipErrorNoBinaryForGpu-unable-to-find-code-object-for-all-current-devices) ### What APIs and features does HIP support? @@ -243,15 +242,4 @@ When compiling an OpenMP source file with `hipcc -fopenmp`, the compiler may gen Previously, it was essential to declare dynamic shared memory using the HIP_DYNAMIC_SHARED macro for accuracy, as using static shared memory in the same kernel could result in overlapping memory ranges and data-races. Now, the HIP-Clang compiler provides support for extern shared declarations, and the HIP_DYNAMIC_SHARED option is no longer required. You may use the standard extern definition: -extern __shared__ type var[]; - -### I have multiple HIP enabled devices and I am getting an error message hipErrorNoBinaryForGpu Unable to find code object for all current devices? - -This error message is seen due to the fact that you do not have valid code object for all of your devices. 
-
-If you have compiled the application yourself, make sure you have given the correct device name(s) and their features via `--offload-arch`. If you are not specifying `--offload-arch`, make sure that `hipcc` is using the correct offload arch by verifying the hipcc output generated by setting the environment variable `HIPCC_VERBOSE=1`.
-
-If you have a precompiled application/library (like rocblas, tensorflow, etc.) which gives you such an error, there are two possibilities:
-
- - The application/library does not ship code object bundles for *all* of your device(s): in this case you need to recompile the application/library yourself with the correct `--offload-arch`.
- - The application/library does not ship code object bundles for *some* of your device(s), for example you have a system with an APU + GPU and the library does not ship code objects for your APU. For this you can set the environment variable `HIP_VISIBLE_DEVICES` to only enable GPUs for which a code object is available. This will limit the GPUs visible to your application and allow it to run.
\ No newline at end of file
+extern __shared__ type var[];
\ No newline at end of file
diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md
index a3fd0f8f26..526c1538c9 100644
--- a/docs/markdown/hip_kernel_language.md
+++ b/docs/markdown/hip_kernel_language.md
@@ -172,8 +172,8 @@ Previously, it was essential to declare dynamic shared memory using the HIP_DYNA
 Now, the HIP-Clang compiler provides support for extern shared declarations, and the HIP_DYNAMIC_SHARED option is no longer required.
 ### `__managed__`
-Managed memory, except the `__managed__` keyword, is supported in HIP combined host/device compilation.
-Support of the `__managed__` keyword is under development.
+Managed memory, including the `__managed__` keyword, is supported in HIP combined host/device compilation.
+Support of the `__managed__` keyword in hipRTC and dynamically loaded code objects is under development.
 ### `__restrict__`
 The `__restrict__` keyword tells the compiler that the associated memory pointer will not alias with any other pointer in the kernel or function. This feature can help the compiler generate better code. In most cases, all pointer arguments must use this keyword to realize the benefit.
@@ -511,72 +511,40 @@ Returns the value of counter that is incremented every clock cycle on device. Di
 Atomic functions execute as read-modify-write operations residing in global or shared memory. No other device or thread can observe or modify the memory location during an atomic operation. If multiple instructions from different devices or threads target the same memory location, the instructions are serialized in an undefined order.
-HIP adds new APIs with the _system suffix to support system-scope atomic operations. For example, atomicAnd is dedicated to the GPU device, while atomicAnd_system allows developers to extend the atomic operation to system scope, from the GPU device to other CPUs and GPU devices in the system.
-
 HIP supports the following atomic operations.
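For illustration, a minimal HIP kernel using one of the device-scope atomics from the table that follows (a sketch only, not part of this patch; it assumes a standard HIP toolchain and a caller that allocates `in`/`total` and launches the kernel):

```cpp
#include <hip/hip_runtime.h>

// Each thread folds one element into a single accumulator. atomicAdd
// performs the serialized read-modify-write described in the text above.
__global__ void reduceSum(const int* in, int* total, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        atomicAdd(total, in[i]);
    }
}
```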
-| **Function** | **Supported in HIP** | **Supported in CUDA** | -| -------------------------------------------------------------------------------------------------------------------- | --------------------- | ---------------------- | -| int atomicAdd(int* address, int val) | ✓ | ✓ | -| int atomicAdd_system(int* address, int val) | ✓ | ✓ | -| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ✓ | ✓ | -| unsigned int atomicAdd_system(unsigned int* address, unsigned int val) | ✓ | ✓ | -| unsigned long long atomicAdd(unsigned long long* address,unsigned long long val) | ✓ | ✓ | -| unsigned long long atomicAdd_system(unsigned long long* address, unsigned long long val) | ✓ | ✓ | -| float atomicAdd(float* address, float val) | ✓ | ✓ | -| float atomicAdd_system(float* address, float val) | ✓ | ✓ | -| double atomicAdd(double* address, double val) | ✓ | ✓ | -| double atomicAdd_system(double* address, double val) | ✓ | ✓ | -| int atomicSub(int* address, int val) | ✓ | ✓ | -| int atomicSub_system(int* address, int val) | ✓ | ✓ | -| unsigned int atomicSub(unsigned int* address,unsigned int val) | ✓ | ✓ | -| unsigned int atomicSub_system(unsigned int* address, unsigned int val) | ✓ | ✓ | -| int atomicExch(int* address, int val) | ✓ | ✓ | -| int atomicExch_system(int* address, int val) | ✓ | ✓ | -| unsigned int atomicExch(unsigned int* address,unsigned int val) | ✓ | ✓ | -| unsigned int atomicExch_system(unsigned int* address, unsigned int val) | ✓ | ✓ | -| unsigned long long atomicExch(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | -| unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) | ✓ | ✓ | -| unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) | ✓ | ✓ | -| float atomicExch(float* address, float val) | ✓ | ✓ | -| int atomicMin(int* address, int val) | ✓ | ✓ | -| int atomicMin_system(int* address, int val) | ✓ | ✓ | -| unsigned int atomicMin(unsigned int* address,unsigned int val) | ✓ | ✓ | -| unsigned int atomicMin_system(unsigned int* address, unsigned int val) | ✓ | ✓ | -| unsigned long long atomicMin(unsigned long long* address,unsigned long long val) | ✓ | ✓ | -| int atomicMax(int* address, int val) | ✓ | ✓ | -| int atomicMax_system(int* address, int val) | ✓ | ✓ | -| unsigned int atomicMax(unsigned int* address,unsigned int val) | ✓ | ✓ | -| unsigned int atomicMax_system(unsigned int* address, unsigned int val) | ✓ | ✓ | -| unsigned long long atomicMax(unsigned long long* address,unsigned long long val) | ✓ | ✓ | -| unsigned int atomicInc(unsigned int* address) | ✗ | ✓ | -| unsigned int atomicDec(unsigned int* address) | ✗ | ✓ | -| int atomicCAS(int* address, int compare, int val) | ✓ | ✓ | -| int atomicCAS_system(int* address, int compare, int val) | ✓ | ✓ | -| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ✓ | ✓ | -| unsigned int atomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) | ✓ | ✓ | -| unsigned long long atomicCAS(unsigned long long* address,unsigned long long compare,unsigned long long val) | ✓ | ✓ | -| unsigned long long atomicCAS_system(unsigned long long* address, unsigned long long compare, unsigned long long val) | ✓ | ✓ | -| int atomicAnd(int* address, int val) | ✓ | ✓ | -| int atomicAnd_system(int* address, int val) | ✓ | ✓ | -| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ✓ | ✓ | -| unsigned int atomicAnd_system(unsigned int* address, unsigned int val) | ✓ | ✓ | -| unsigned 
long long atomicAnd(unsigned long long* address,unsigned long long val) | ✓ | ✓ | -| unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) | ✓ | ✓ | -| int atomicOr(int* address, int val) | ✓ | ✓ | -| int atomicOr_system(int* address, int val) | ✓ | ✓ | -| unsigned int atomicOr(unsigned int* address,unsigned int val) | ✓ | ✓ | -| unsigned int atomicOr_system(unsigned int* address, unsigned int val) | ✓ | ✓ | -| unsigned int atomicOr_system(unsigned int* address, unsigned int val) | ✓ | ✓ | -| unsigned long long atomicOr(unsigned long long int* address,unsigned long long val) | ✓ | ✓ | -| unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) | ✓ | ✓ | -| int atomicXor(int* address, int val) | ✓ | ✓ | -| int atomicXor_system(int* address, int val) | ✓ | ✓ | -| unsigned int atomicXor(unsigned int* address,unsigned int val) | ✓ | ✓ | -| unsigned int atomicXor_system(unsigned int* address, unsigned int val) | ✓ | ✓ | -| unsigned long long atomicXor(unsigned long long* address,unsigned long long val)) | ✓ | ✓ | -| unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) | ✓ | ✓ | - +| **Function** | **Supported in HIP** | **Supported in CUDA** | +| --- | --- | --- | +| int atomicAdd(int* address, int val) | ✓ | ✓ | +| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| float atomicAdd(float* address, float val) | ✓ | ✓ | +| int atomicSub(int* address, int val) | ✓ | ✓ | +| unsigned int atomicSub(unsigned int* address,unsigned int val) | ✓ | ✓ | +| int atomicExch(int* address, int val) | ✓ | ✓ | +| unsigned int atomicExch(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned long long int atomicExch(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| float atomicExch(float* address, float val) | ✓ | ✓ | +| int atomicMin(int* address, int val) | ✓ | ✓ | +| unsigned int atomicMin(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| int atomicMax(int* address, int val) | ✓ | ✓ | +| unsigned int atomicMax(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned int atomicInc(unsigned int* address)| ✗ | ✓ | +| unsigned int atomicDec(unsigned int* address)| ✗ | ✓ | +| int atomicCAS(int* address, int compare, int val) | ✓ | ✓ | +| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ✓ | ✓ | +| unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ✓ | ✓ | +| int atomicAnd(int* address, int val) | ✓ | ✓ | +| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| int atomicOr(int* address, int val) | ✓ | ✓ | +| unsigned int atomicOr(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned long long int atomicOr(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| int atomicXor(int* address, int val) | ✓ | ✓ | +| unsigned int atomicXor(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned long long int atomicXor(unsigned long long int* address,unsigned long long int val)) | ✓ | ✓ 
|
 ### Caveats and Features Under-Development:
diff --git a/docs/markdown/hip_programming_guide.md b/docs/markdown/hip_programming_guide.md
index f8fb0583eb..63badd4773 100644
--- a/docs/markdown/hip_programming_guide.md
+++ b/docs/markdown/hip_programming_guide.md
@@ -25,7 +25,7 @@ Numa distance is the measurement of how far between GPU and CPU devices.
 By default, each GPU selects the Numa CPU node with the least Numa distance between them; that is, host memory will be automatically allocated from the memory pool of the Numa node closest to the current GPU device. A different GPU selected with the hipSetDevice API will still be able to access the host allocation, but may have a longer Numa distance.
 ### Managed memory allocation
-Managed memory, except the `__managed__` keyword, is supported in HIP combined host/device compilation.
+Managed memory, including the `__managed__` keyword, is supported in HIP combined host/device compilation.
 The allocation will be automatically managed by AMD HMM (Heterogeneous Memory Management).
 In a HIP application, there should be a capability check before making the managed memory API call hipMallocManaged.
@@ -47,16 +47,6 @@ else {
 ```
 For more details on managed memory APIs, please refer to the documentation HIP-API.pdf.
-### HIP Stream Memory Operations
-
-HIP supports Stream Memory Operations to enable direct synchronization between network nodes and the GPU. The following new APIs are added:
-    hipStreamWaitValue32
-    hipStreamWaitValue64
-    hipStreamWriteValue32
-    hipStreamWriteValue64
-
-For more details, please check the documentation HIP-API.pdf.
-
 ### Coherency Controls
 ROCm defines two coherency options for host memory:
 - Coherent memory : Supports fine-grain synchronization while the kernel is running. For example, a kernel can perform atomic operations that are visible to the host CPU or to other (peer) GPUs. Synchronization instructions include threadfence_system and C++11-style atomic operations. However, coherent memory cannot be cached by the GPU and thus may have lower performance.
@@ -98,15 +88,6 @@ In case events are used across multiple dispatches, for example, start and stop
 - Coherent host memory is the default and is the easiest to use since the memory is visible to the CPU at typical synchronization points. This memory allows in-kernel synchronization commands such as threadfence_system to work transparently.
 - HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide a performance benefit, but care must be taken to use the correct synchronization.
-## HIP Runtime Compilation
-HIP now supports runtime compilation (hipRTC), which may provide optimizations and performance improvements compared with regular offline static compilation.
-
-hipRTC APIs accept HIP source files in character-string form as input parameters and create program handles by compiling the HIP source files without spawning separate processes.
-
-For more details on hipRTC APIs, refer to HIP-API.pdf in GitHub (https://github.com/RadeonOpenCompute/ROCm).
-
-The link here (https://github.com/ROCm-Developer-Tools/HIP/blob/main/tests/src/hiprtc/saxpy.cpp) shows an example of how to program a HIP application using the runtime compilation mechanism.
-
 ## Device-Side Malloc
 HIP-Clang currently doesn't support device-side malloc and free.
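A minimal sketch of the capability check recommended in the managed-memory hunk above (not part of this patch; `hipDeviceGetAttribute`, `hipDeviceAttributeManagedMemory`, and `hipMallocManaged` are the public HIP APIs the guide refers to, and device 0 is an arbitrary choice):

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    int managed = 0;
    // Check managed-memory support before calling hipMallocManaged,
    // as the guide recommends.
    hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory, 0);
    if (managed) {
        int* p = nullptr;
        if (hipMallocManaged(reinterpret_cast<void**>(&p), 1024 * sizeof(int)) == hipSuccess) {
            p[0] = 42;  // the same pointer is usable from host and device
            hipFree(p);
        }
    } else {
        printf("Managed memory is not supported on device 0\n");
    }
    return 0;
}
```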
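Similarly, a sketch contrasting the two host-memory coherency options described under "Coherency Controls" above (illustrative only; `hipHostMallocCoherent` and `hipHostMallocNonCoherent` are the public HIP flags, and actual behavior depends on platform and driver):

```cpp
#include <hip/hip_runtime.h>

int main() {
    void* coherent = nullptr;
    void* nonCoherent = nullptr;
    // Fine-grained ("coherent") host memory: device writes and atomics are
    // visible to the host while a kernel runs, but the GPU cannot cache it.
    hipHostMalloc(&coherent, 4096, hipHostMallocCoherent);
    // "Non-coherent" host memory: cacheable on the GPU, so typically faster,
    // but only synchronized at command boundaries (e.g. hipStreamSynchronize).
    hipHostMalloc(&nonCoherent, 4096, hipHostMallocNonCoherent);
    hipHostFree(coherent);
    hipHostFree(nonCoherent);
    return 0;
}
```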
diff --git a/docs/markdown/hip_terms.md b/docs/markdown/hip_terms.md
index db9a90eee4..379d08c22f 100644
--- a/docs/markdown/hip_terms.md
+++ b/docs/markdown/hip_terms.md
@@ -1,38 +1,40 @@
 # Table Comparing Syntax for Different Compute APIs
-|Term|CUDA|HIP|OpenCL|
-|---|---|---|---|
-|Device|`int deviceId`|`int deviceId`|`cl_device`|
-|Queue|`cudaStream_t`|`hipStream_t`|`cl_command_queue`|
-|Event|`cudaEvent_t`|`hipEvent_t`|`cl_event`|
-|Memory|`void *`|`void *`|`cl_mem`|
+|Term|CUDA|HIP|HC|C++AMP|OpenCL|
+|---|---|---|---|---|---|
+|Device|`int deviceId`|`int deviceId`|`hc::accelerator`|`concurrency::accelerator`|`cl_device`
+|Queue|`cudaStream_t`|`hipStream_t`|`hc::accelerator_view`|`concurrency::accelerator_view`|`cl_command_queue`
+|Event|`cudaEvent_t`|`hipEvent_t`|`hc::completion_future`|`concurrency::completion_future`|`cl_event`
+|Memory|`void *`|`void *`|`void *`; `hc::array`; `hc::array_view`|`concurrency::array`; `concurrency::array_view`|`cl_mem`
 |||||
-| |grid|grid|NDRange|
-| |block|block|work-group|
-| |thread|thread|work-item|
-| |warp|warp|sub-group|
+| |grid|grid|extent|extent|NDRange
+| |block|block|tile|tile|work-group
+| |thread|thread|thread|thread|work-item
+| |warp|warp|wavefront|N/A|sub-group
 |||||
-|Thread-index | threadIdx.x | threadIdx.x | get_local_id(0) |
-|Block-index | blockIdx.x | blockIdx.x | get_group_id(0) |
-|Block-dim | blockDim.x | blockDim.x | get_local_size(0) |
-|Grid-dim | gridDim.x | gridDim.x | get_num_groups(0) |
+|Thread-index | threadIdx.x | hipThreadIdx_x | t_idx.local[0] | t_idx.local[0] | get_local_id(0) |
+|Block-index | blockIdx.x | hipBlockIdx_x | t_idx.tile[0] | t_idx.tile[0] | get_group_id(0) |
+|Block-dim | blockDim.x | hipBlockDim_x | t_ext.tile_dim[0]| t_idx.tile_dim0 | get_local_size(0) |
+|Grid-dim | gridDim.x | hipGridDim_x | t_ext[0]| t_ext[0] | get_global_size(0) |
 |||||
-|Device Kernel|`__global__`|`__global__`|`__kernel`|
-|Device Function|`__device__`|`__device__`|Implied in device compilation|
-|Host Function|`__host__` (default)|`__host__` (default)|Implied in host compilation|
-|Host + Device Function|`__host__` `__device__`|`__host__` `__device__`| No equivalent|
-|Kernel Launch|`<<< >>>`|`hipLaunchKernel`/`hipLaunchKernelGGL`/`<<< >>>`|`clEnqueueNDRangeKernel`|
+|Device Kernel|`__global__`|`__global__`|lambda inside `hc::parallel_for_each` or [[hc]]|`restrict(amp)`|`__kernel`
+|Device Function|`__device__`|`__device__`|`[[hc]]` (detected automatically in many cases)|`restrict(amp)`|Implied in device compilation
+|Host Function|`__host__` (default)|`__host__` (default)|`[[cpu]]` (default)|`restrict(cpu)` (default)|Implied in host compilation.
+|Host + Device Function|`__host__` `__device__`|`__host__` `__device__`| `[[hc]]` `[[cpu]]`|`restrict(amp,cpu)`|No equivalent
+|Kernel Launch|`<<< >>>`|`hipLaunchKernel`|`hc::parallel_for_each`|`concurrency::parallel_for_each`|`clEnqueueNDRangeKernel`
 ||||||
-|Global Memory|`__global__`|`__global__`|`__global`|
-|Group Memory|`__shared__`|`__shared__`|`__local`|
-|Constant|`__constant__`|`__constant__`|`__constant`|
+|Global Memory|`__global__`|`__global__`|Unnecessary / Implied|Unnecessary / Implied|`__global`
+|Group Memory|`__shared__`|`__shared__`|`tile_static`|`tile_static`|`__local`
+|Constant|`__constant__`|`__constant__`|Unnecessary / Implied|Unnecessary / Implied|`__constant`
 ||||||
-||`__syncthreads`|`__syncthreads`|`barrier(CLK_LOCAL_MEM_FENCE)`|
-|Atomic Builtins|`atomicAdd`|`atomicAdd`|`atomic_add`|
-|Precise Math|`cos(f)`|`cos(f)`|`cos(f)`|
-|Fast Math|`__cos(f)`|`__cos(f)`|`native_cos(f)`|
-|Vector|`float4`|`float4`|`float4`|
+||`__syncthreads`|`__syncthreads`|`tile_static.barrier()`|`t_idx.barrier()`|`barrier(CLK_LOCAL_MEM_FENCE)`
+|Atomic Builtins|`atomicAdd`|`atomicAdd`|`hc::atomic_fetch_add`|`concurrency::atomic_fetch_add`|`atomic_add`
+|Precise Math|`cos(f)`|`cos(f)`|`hc::precise_math::cos(f)`|`concurrency::precise_math::cos(f)`|`cos(f)`
+|Fast Math|`__cos(f)`|`__cos(f)`|`hc::fast_math::cos(f)`|`concurrency::fast_math::cos(f)`|`native_cos(f)`
+|Vector|`float4`|`float4`|`hc::short_vector::float4`|`concurrency::graphics::float_4`|`float4`
 ### Notes
-The indexing functions (starting with `thread-index`) show the terminology for a 1D grid. Some APIs use reverse order of xyz / 012 indexing for 3D grids.
+1. For HC and C++AMP, assume a captured _tiled_ext_ named "t_ext" and captured _extent_ named "ext". These languages use captured variables to pass information to the kernel rather than using special built-in functions, so the exact variable name may vary.
+2. The indexing functions (starting with `thread-index`) show the terminology for a 1D grid. Some APIs use reverse order of xyz / 012 indexing for 3D grids.
+3. HC allows tile dimensions to be specified at runtime, while C++AMP requires that tile dimensions be specified at compile-time. Thus the HC syntax for tile dims is `t_ext.tile_dim[0]` while the C++AMP syntax is `t_ext.tile_dim0`.
diff --git a/docs/markdown/obj_tooling.md b/docs/markdown/obj_tooling.md
index f3c728b197..f107d0a66a 100644
--- a/docs/markdown/obj_tooling.md
+++ b/docs/markdown/obj_tooling.md
@@ -1,37 +1,8 @@
 # ROCm Code Object tooling
-ROCm compiler generated code objects (executables, object files, and shared
-object libraries) can be examined and code objects extracted with the following
-tools.
+ROCm compiler generated code objects (executables, object files, and shared object libraries) can be examined and code objects extracted with the following tools.
 
-## roc-obj
-
-High-level wrapper around low-level tooling described below. For a more
-detailed overview, see the help text available with `roc-obj --help`.
-
-### Examples:
-
-#### Extract all ROCm code objects from a list of executables
-    roc-obj executable...
-
-#### Extract all ROCm code objects from a list of executables, and disassemble them
-    roc-obj --disassemble executable...
-    # or
-    roc-obj -d executable...
-
-#### Extract all ROCm code objects from a list of executables into dir/
-    roc-obj --outdir dir/ executable...
-    # or
-    roc-obj -o dir/ executable...
-
-#### Extract only ROCm code objects matching regex over Target ID
-    roc-obj --grep gfx9 executable...
-    # or
-    roc-obj -g gfx9 executable...
-
-## Low-Level Tooling
-
-### URI syntax:
+## URI syntax:
 
 ROCm Code Objects can be listed/accessed using the following URI syntax:
 ```
@@ -46,7 +17,8 @@ detailed overview, see the help text available with `roc-obj --help`.
 Example: file://dir1/dir2/hello_world#offset=133&size=14472
          memory://1234#offset=0x20000&size=3000
-### List available ROCm Code Objects: rocm-obj-ls
+
+## List available ROCm Code Objects: roc-obj-ls
 
 Use this tool to list available ROCm code objects. Code objects are listed using URI syntax.
@@ -56,7 +28,7 @@ detailed overview, see the help text available with `roc-obj --help`.
 -h  Show this help message
-### Extract ROCm Code Objects: rocm-obj-extract
+## Extract ROCm Code Objects: roc-obj-extract
 
 Extracts available ROCm code objects from specified URI.
@@ -72,24 +44,24 @@ detailed overview, see the help text available with `roc-obj --help`.
 Note, when specifying a URI argument to roc-obj-extract, if cut and pasting the output from roc-obj-ls you need to escape the '&' character or your shell will interpret it as the option to run the command as a background process.
 As an example, if roc-obj-ls generates a URI like this ```file://my_exe#offset=24576&size=46816```, you need to use the following argument to roc-obj-extract: ```file://my_exe#offset=24576\&size=46816```
-### Examples:
+## Examples:
-#### Dump all code objects to current directory:
+### Dump all code objects to current directory:
     roc-obj-ls | roc-obj-extract
-#### Dump the ISA for gfx906:
-    roc-obj-ls -v | awk '/gfx906/{print $2}' | roc-obj-extract -o - | llvm-objdump -d - > .gfx906.isa
+### Dump the ISA for gfx906:
+    roc-obj-ls -v | grep "gfx906" | awk '{print $2}' | roc-obj-extract -o - | llvm-objdump -d - > .gfx906.isa
-#### Check the e_flags of the gfx908 code object:
-    roc-obj-ls -v | awk '/gfx908/{print $2}' | roc-obj-extract -o - | llvm-readelf -h - | grep Flags
+### Check the e_flags of the gfx908 code object:
+    roc-obj-ls -v | grep "gfx908" | awk '{print $2}' | roc-obj-extract -o - | llvm-readelf -h - | grep Flags
-#### Disassemble the fourth code object:
+### Disassemble the fourth code object:
     roc-obj-ls | sed -n 4p | roc-obj-extract -o - | llvm-objdump -d -
-#### Sort embedded code objects by size:
+### Sort embedded code objects by size:
     for uri in $(roc-obj-ls ); do printf "%d: %s\n" "$(roc-obj-extract -o - "$uri" | wc -c)" "$uri"; done | sort -n
-#### Compare disassembly of gfx803 and gfx900 code objects:
+### Compare disassembly of gfx803 and gfx900 code objects:
     dis() { roc-obj-ls -v | grep "$1" | awk '{print $2}' | roc-obj-extract -o - | llvm-objdump -d -; }
     diff <(dis gfx803) <(dis gfx900)
diff --git a/hip-config.cmake.in b/hip-config.cmake.in
index 2c73ed0412..e696226834 100755
--- a/hip-config.cmake.in
+++ b/hip-config.cmake.in
@@ -104,23 +104,11 @@ if(HIP_COMPILER STREQUAL "clang")
   if(NOT HIP_CXX_COMPILER)
     set(HIP_CXX_COMPILER ${CMAKE_CXX_COMPILER})
   endif()
-  if(HIP_CXX_COMPILER MATCHES ".*hipcc" OR HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")
+  if(HIP_CXX_COMPILER MATCHES ".*hipcc")
     execute_process(COMMAND ${HIP_CXX_COMPILER} --version
       OUTPUT_STRIP_TRAILING_WHITESPACE
-      OUTPUT_VARIABLE HIP_CXX_COMPILER_VERSION_OUTPUT)
-    # Capture the repo, branch and patch level details of the HIP CXX Compiler.
-    # Ex.
clang version 13.0.0 (https://github.com/ROCm-Developer-Tools/HIP main 12345 COMMIT_HASH) - # HIP_CLANG_REPO: https://github.com/ROCm-Developer-Tools/HIP - # HIP_CLANG_BRANCH: main - # HIP_CLANG_PATCH_LEVEL: 12345 - if(${HIP_CXX_COMPILER_VERSION_OUTPUT} MATCHES "clang version [0-9]+\\.[0-9]+\\.[0-9]+ \\(([^ \n]*) ([^ \n]*) ([^ \n]*)") - set(HIP_CLANG_REPO ${CMAKE_MATCH_1}) - set(HIP_CLANG_BRANCH ${CMAKE_MATCH_2}) - set(HIP_CLANG_PATCH_LEVEL ${CMAKE_MATCH_3}) - endif() - endif() - if(HIP_CXX_COMPILER MATCHES ".*hipcc") - if(HIP_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[ \t]*([^\n]*)") + OUTPUT_VARIABLE HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT) + if(HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[ \t]*([^\n]*)") get_filename_component(HIP_CLANG_ROOT "${CMAKE_MATCH_1}" DIRECTORY) endif() elseif (HIP_CXX_COMPILER MATCHES ".*clang\\+\\+") @@ -135,10 +123,8 @@ if(HIP_COMPILER STREQUAL "clang") if(NOT WIN32) find_dependency(AMDDeviceLibs) endif() - set(GPU_DEFAULT_TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx1030") - set(GPU_SUPPORTED_TARGETS "gfx701;gfx801;gfx802;gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031") - set(GPU_TARGETS "${GPU_DEFAULT_TARGETS}" CACHE STRING "GPU targets to compile for") - set_property(CACHE GPU_TARGETS PROPERTY STRINGS ${GPU_SUPPORTED_TARGETS}) + set(AMDGPU_TARGETS "gfx900;gfx906;gfx908" CACHE STRING "AMD GPU targets to compile for") + set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU targets to compile for") endif() if(NOT WIN32) @@ -153,6 +139,7 @@ include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" ) if(NOT WIN32) find_dependency(hsa-runtime64) find_dependency(Threads) + find_dependency(ROCclr) endif() #get_filename_component cannot resolve the symlinks if called from /opt/rocm/lib/hip @@ -187,6 +174,14 @@ if(HIP_RUNTIME MATCHES "rocclr") get_target_property(amdhip64_type hip::amdhip64 TYPE) message(STATUS "hip::amdhip64 is ${amdhip64_type}") + if(${amdhip64_type} STREQUAL "STATIC_LIBRARY") + # For cyclic dependence + get_target_property(link_interf_libs amdrocclr_static LINK_INTERFACE_LIBRARIES) + if(NOT "${link_interf_libs}" MATCHES "hip::amdhip64") + # Prevent repeatedly linking dependence + target_link_libraries(amdrocclr_static INTERFACE hip::amdhip64) + endif() + endif() if(NOT WIN32) set_target_properties(hip::device PROPERTIES diff --git a/include/hip/amd_detail/channel_descriptor.h b/include/hip/amd_detail/channel_descriptor.h new file mode 100644 index 0000000000..d23f341ef9 --- /dev/null +++ b/include/hip/amd_detail/channel_descriptor.h @@ -0,0 +1,348 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H + +#include +#include +#include + +#ifdef __cplusplus + +extern "C" HIP_PUBLIC_API +hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f); + +static inline hipChannelFormatDesc hipCreateChannelDescHalf() { + int e = (int)sizeof(unsigned short) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat); +} + +static inline hipChannelFormatDesc hipCreateChannelDescHalf1() { + int e = (int)sizeof(unsigned short) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat); +} + +static inline hipChannelFormatDesc hipCreateChannelDescHalf2() { + int e = (int)sizeof(unsigned short) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat); +} + +template +static inline hipChannelFormatDesc hipCreateChannelDesc() { + return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(char) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed char) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned char) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned char) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed char) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned char) * 8; + return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed char) * 8; + return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned); +} + +#ifndef __GNUC__ // vector3 is the same as vector4 +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned char) * 8; + return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed char) * 8; + return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned); +} +#endif + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned char) * 8; + return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed char) * 8; + return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned short) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline 
hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed short) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned short) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed short) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned short) * 8; + return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed short) * 8; + return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned); +} + +#ifndef __GNUC__ +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned short) * 8; + return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed short) * 8; + return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned); +} +#endif + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned short) * 8; + return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed short) * 8; + return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned int) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed int) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned int) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed int) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned int) * 8; + return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed int) * 8; + return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned); +} + +#ifndef __GNUC__ +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned int) * 8; + return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed int) * 8; + return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned); +} +#endif + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned int) * 8; + return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed int) * 8; + return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned); +} + +template 
<> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(float) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(float) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(float) * 8; + return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat); +} + +#ifndef __GNUC__ +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(float) * 8; + return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat); +} +#endif + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(float) * 8; + return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned long) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed long) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned long) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed long) * 8; + return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned long) * 8; + return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed long) * 8; + return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned); +} + +#ifndef __GNUC__ +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned long) * 8; + return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed long) * 8; + return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned); +} +#endif + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(unsigned long) * 8; + return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned); +} + +template <> +inline hipChannelFormatDesc hipCreateChannelDesc() { + int e = (int)sizeof(signed long) * 8; + return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned); +} + +#else + +struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, + enum hipChannelFormatKind f); + +#endif + +#endif diff --git a/include/hip/amd_detail/concepts.hpp b/include/hip/amd_detail/concepts.hpp new file mode 100644 index 0000000000..373cefb292 --- /dev/null +++ b/include/hip/amd_detail/concepts.hpp @@ -0,0 +1,30 @@ +/* +Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +namespace hip_impl // Documentation only. +{ +#define requires(...) + +#define FunctionalProcedure typename +} // namespace hip_impl diff --git a/include/hip/amd_detail/cuda/cuda.h b/include/hip/amd_detail/cuda/cuda.h new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/include/hip/amd_detail/cuda/cuda.h @@ -0,0 +1 @@ + diff --git a/include/hip/amd_detail/cuda/math_functions.h b/include/hip/amd_detail/cuda/math_functions.h new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/include/hip/amd_detail/cuda/math_functions.h @@ -0,0 +1 @@ + diff --git a/include/hip/amd_detail/device_functions.h b/include/hip/amd_detail/device_functions.h new file mode 100644 index 0000000000..f57460962b --- /dev/null +++ b/include/hip/amd_detail/device_functions.h @@ -0,0 +1,1347 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H + +#include "host_defines.h" +#include "math_fwd.h" + +#if !defined(__HIPCC_RTC__) +#include +#include +#endif // !defined(__HIPCC_RTC__) + +#include +#include +#include + +#if __HIP_CLANG_ONLY__ +extern "C" __device__ int printf(const char *fmt, ...); +#else +template +static inline __device__ void printf(const char* format, All... 
all) {} +#endif // __HIP_CLANG_ONLY__ + +/* +Integer Intrinsics +*/ + +// integer intrinsic function __poc __clz __ffs __brev +__device__ static inline unsigned int __popc(unsigned int input) { + return __builtin_popcount(input); +} +__device__ static inline unsigned int __popcll(unsigned long long int input) { + return __builtin_popcountll(input); +} + +__device__ static inline int __clz(int input) { + return __ockl_clz_u32((uint)input); +} + +__device__ static inline int __clzll(long long int input) { + return __ockl_clz_u64((uint64_t)input); +} + +__device__ static inline unsigned int __ffs(unsigned int input) { + return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1; +} + +__device__ static inline unsigned int __ffsll(unsigned long long int input) { + return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1; +} + +__device__ static inline unsigned int __ffs(int input) { + return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1; +} + +__device__ static inline unsigned int __ffsll(long long int input) { + return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1; +} + +__device__ static inline unsigned int __brev(unsigned int input) { + return __builtin_bitreverse32(input); +} + +__device__ static inline unsigned long long int __brevll(unsigned long long int input) { + return __builtin_bitreverse64(input); +} + +__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) { + return input == 0 ? -1 : __builtin_ctzl(input); +} + +__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) { + uint32_t offset = src1 & 31; + uint32_t width = src2 & 31; + return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width); +} + +__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) { + uint64_t offset = src1 & 63; + uint64_t width = src2 & 63; + return width == 0 ? 
0 : (src0 << (64 - offset - width)) >> (64 - width); +} + +__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) { + uint32_t offset = src2 & 31; + uint32_t width = src3 & 31; + uint32_t mask = (1 << width) - 1; + return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); +} + +__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) { + uint64_t offset = src2 & 63; + uint64_t width = src3 & 63; + uint64_t mask = (1ULL << width) - 1; + return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); +} + +__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s); +__device__ static unsigned int __hadd(int x, int y); +__device__ static int __mul24(int x, int y); +__device__ static long long int __mul64hi(long long int x, long long int y); +__device__ static int __mulhi(int x, int y); +__device__ static int __rhadd(int x, int y); +__device__ static unsigned int __sad(int x, int y,unsigned int z); +__device__ static unsigned int __uhadd(unsigned int x, unsigned int y); +__device__ static int __umul24(unsigned int x, unsigned int y); +__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y); +__device__ static unsigned int __umulhi(unsigned int x, unsigned int y); +__device__ static unsigned int __urhadd(unsigned int x, unsigned int y); +__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z); + +struct ucharHolder { + union { + unsigned char c[4]; + unsigned int ui; + }; +} __attribute__((aligned(4))); + +struct uchar2Holder { + union { + unsigned int ui[2]; + unsigned char c[8]; + }; +} __attribute__((aligned(8))); + +__device__ +static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) { + struct uchar2Holder cHoldVal; + struct ucharHolder cHoldKey; + cHoldKey.ui = s; + cHoldVal.ui[0] = x; + cHoldVal.ui[1] = y; + unsigned int result; + result = cHoldVal.c[cHoldKey.c[0] & 0x07]; + result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8); + result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16); + result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24); + return result; +} + +__device__ static inline unsigned int __hadd(int x, int y) { + int z = x + y; + int sign = z & 0x8000000; + int value = z & 0x7FFFFFFF; + return ((value) >> 1 || sign); +} + +__device__ static inline int __mul24(int x, int y) { + return __ockl_mul24_i32(x, y); +} + +__device__ static inline long long __mul64hi(long long int x, long long int y) { + ulong x0 = (ulong)x & 0xffffffffUL; + long x1 = x >> 32; + ulong y0 = (ulong)y & 0xffffffffUL; + long y1 = y >> 32; + ulong z0 = x0*y0; + long t = x1*y0 + (z0 >> 32); + long z1 = t & 0xffffffffL; + long z2 = t >> 32; + z1 = x0*y1 + z1; + return x1*y1 + z2 + (z1 >> 32); +} + +__device__ static inline int __mulhi(int x, int y) { + return __ockl_mul_hi_i32(x, y); +} + +__device__ static inline int __rhadd(int x, int y) { + int z = x + y + 1; + int sign = z & 0x8000000; + int value = z & 0x7FFFFFFF; + return ((value) >> 1 || sign); +} +__device__ static inline unsigned int __sad(int x, int y, unsigned int z) { + return x > y ? 
x - y + z : y - x + z; +} +__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) { + return (x + y) >> 1; +} +__device__ static inline int __umul24(unsigned int x, unsigned int y) { + return __ockl_mul24_u32(x, y); +} + +__device__ +static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) { + ulong x0 = x & 0xffffffffUL; + ulong x1 = x >> 32; + ulong y0 = y & 0xffffffffUL; + ulong y1 = y >> 32; + ulong z0 = x0*y0; + ulong t = x1*y0 + (z0 >> 32); + ulong z1 = t & 0xffffffffUL; + ulong z2 = t >> 32; + z1 = x0*y1 + z1; + return x1*y1 + z2 + (z1 >> 32); +} + +__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) { + return __ockl_mul_hi_u32(x, y); +} +__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) { + return (x + y + 1) >> 1; +} +__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) { + return __ockl_sadd_u32(x, y, z); +} + +__device__ static inline unsigned int __lane_id() { + return __builtin_amdgcn_mbcnt_hi( + -1, __builtin_amdgcn_mbcnt_lo(-1, 0)); +} + +__device__ +static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);}; + +__device__ +static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);}; + +/* +HIP specific device functions +*/ + +__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) { + union { int i; unsigned u; float f; } tmp; tmp.u = src; + tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i); + return tmp.u; +} + +__device__ static inline float __hip_ds_bpermutef(int index, float src) { + union { int i; unsigned u; float f; } tmp; tmp.f = src; + tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i); + return tmp.f; +} + +__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) { + union { int i; unsigned u; float f; } tmp; tmp.u = src; + tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i); + return tmp.u; +} + +__device__ static inline float __hip_ds_permutef(int index, float src) { + union { int i; unsigned u; float f; } tmp; tmp.u = src; + tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i); + return tmp.u; +} + +#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src)) +#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src)) + +template +__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) { + union { int i; unsigned u; float f; } tmp; tmp.u = src; + tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern); + return tmp.u; +} + +template +__device__ static inline float __hip_ds_swizzlef_N(float src) { + union { int i; unsigned u; float f; } tmp; tmp.f = src; + tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern); + return tmp.f; +} + +#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \ + __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src)) + +template +__device__ static inline int __hip_move_dpp_N(int src) { + return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask, + bound_ctrl); +} + +static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE; + +__device__ +inline +int __shfl(int var, int src_lane, int width = warpSize) { + int self = __lane_id(); + int index = src_lane + (self & ~(width-1)); + return __builtin_amdgcn_ds_bpermute(index<<2, var); +} +__device__ +inline +unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) { + union { int i; unsigned u; float 
+    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+    tmp.i = __shfl(tmp.i, src_lane, width);
+    return tmp.u;
+}
+__device__
+inline
+float __shfl(float var, int src_lane, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+    tmp.i = __shfl(tmp.i, src_lane, width);
+    return tmp.f;
+}
+__device__
+inline
+double __shfl(double var, int src_lane, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+long __shfl(long var, int src_lane, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
+    #endif
+}
+__device__
+inline
+unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
+    #endif
+}
+__device__
+inline
+long long __shfl(long long var, int src_lane, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
+__device__
+inline
+int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
+    int self = __lane_id();
+    int index = self - lane_delta;
+    index = (index < (self & ~(width-1)))?self:index;
+    return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+__device__
+inline
+unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+    tmp.i = __shfl_up(tmp.i, lane_delta, width);
+    return tmp.u;
+}
+__device__
+inline
+float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+    tmp.i = __shfl_up(tmp.i, lane_delta, width);
+    return tmp.f;
+}
+__device__
+inline
+double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
+    #endif
+}
+
+__device__
+inline
+unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
+    #endif
+}
+
+__device__
+inline
+long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
+__device__
+inline
+unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
+__device__
+inline
+int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
+    int self = __lane_id();
+    int index = self + lane_delta;
+    index = (int)((self&(width-1))+lane_delta) >= width?self:index;
+    return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+__device__
+inline
+unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+    tmp.i = __shfl_down(tmp.i, lane_delta, width);
+    return tmp.u;
+}
+__device__
+inline
+float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+    tmp.i = __shfl_down(tmp.i, lane_delta, width);
+    return tmp.f;
+}
+__device__
+inline
+double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
+    #endif
+}
+__device__
+inline
+unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
+    #endif
+}
+__device__
+inline
+long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
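+/*
+ * The __shfl_down overloads above are the usual building block for
+ * wavefront-level reductions. A minimal sketch, assuming every lane of the
+ * wavefront is active; the helper name is illustrative.
+ *
+ *   __device__ float wave_reduce_sum(float v) {
+ *       for (int offset = warpSize / 2; offset > 0; offset /= 2)
+ *           v += __shfl_down(v, offset); // out-of-range lanes read themselves
+ *       return v;                        // lane 0 now holds the full sum
+ *   }
+ */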
+__device__
+inline
+unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
+__device__
+inline
+int __shfl_xor(int var, int lane_mask, int width = warpSize) {
+    int self = __lane_id();
+    int index = self^lane_mask;
+    index = index >= ((self+width)&~(width-1))?self:index;
+    return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+__device__
+inline
+unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
+    return tmp.u;
+}
+__device__
+inline
+float __shfl_xor(float var, int lane_mask, int width = warpSize) {
+    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
+    return tmp.f;
+}
+__device__
+inline
+double __shfl_xor(double var, int lane_mask, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+__device__
+inline
+long __shfl_xor(long var, int lane_mask, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
+    #endif
+}
+__device__
+inline
+unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
+    #endif
+}
+__device__
+inline
+long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
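+/*
+ * __shfl_xor pairs lanes whose IDs differ by lane_mask, so a butterfly over
+ * all mask values leaves the combined result in every lane, not just lane 0.
+ * A minimal sketch under the same full-wavefront assumption as above; the
+ * helper name is illustrative.
+ *
+ *   __device__ int wave_allreduce_max(int v) {
+ *       for (int mask = warpSize / 2; mask > 0; mask /= 2) {
+ *           int w = __shfl_xor(v, mask);
+ *           v = v > w ? v : w;
+ *       }
+ *       return v; // every lane holds the wavefront maximum
+ *   }
+ */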
+__device__
+inline
+unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+#define MASK1 0x00ff00ff
+#define MASK2 0xff00ff00
+
+__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
+    char4 out;
+    unsigned one1 = in1.w & MASK1;
+    unsigned one2 = in2.w & MASK1;
+    out.w = (one1 + one2) & MASK1;
+    one1 = in1.w & MASK2;
+    one2 = in2.w & MASK2;
+    out.w = out.w | ((one1 + one2) & MASK2);
+    return out;
+}
+
+__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
+    char4 out;
+    unsigned one1 = in1.w & MASK1;
+    unsigned one2 = in2.w & MASK1;
+    out.w = (one1 - one2) & MASK1;
+    one1 = in1.w & MASK2;
+    one2 = in2.w & MASK2;
+    out.w = out.w | ((one1 - one2) & MASK2);
+    return out;
+}
+
+__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
+    char4 out;
+    unsigned one1 = in1.w & MASK1;
+    unsigned one2 = in2.w & MASK1;
+    out.w = (one1 * one2) & MASK1;
+    one1 = in1.w & MASK2;
+    one2 = in2.w & MASK2;
+    out.w = out.w | ((one1 * one2) & MASK2);
+    return out;
+}
+
+/*
+ * Rounding modes are not yet supported in HIP
+ * TODO: Conversion functions are not correct, need to fix when BE is ready
+*/
+
+__device__ static inline float __double2float_rd(double x) { return (float)x; }
+__device__ static inline float __double2float_rn(double x) { return (float)x; }
+__device__ static inline float __double2float_ru(double x) { return (float)x; }
+__device__ static inline float __double2float_rz(double x) { return (float)x; }
+
+__device__ static inline int __double2hiint(double x) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+
+    int tmp[2];
+    __builtin_memcpy(tmp, &x, sizeof(tmp));
+
+    return tmp[1];
+}
+__device__ static inline int __double2loint(double x) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+
+    int tmp[2];
+    __builtin_memcpy(tmp, &x, sizeof(tmp));
+
+    return tmp[0];
+}
+
+__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
+__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
+__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
+__device__ static inline int __double2int_rz(double x) { return (int)x; }
+
+__device__ static inline long long int __double2ll_rd(double x) {
+    return (long long)__ocml_floor_f64(x);
+}
+__device__ static inline long long int __double2ll_rn(double x) {
+    return (long long)__ocml_rint_f64(x);
+}
+__device__ static inline long long int __double2ll_ru(double x) {
+    return (long long)__ocml_ceil_f64(x);
+}
+__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
+
+__device__ static inline unsigned int __double2uint_rd(double x) {
+    return (unsigned int)__ocml_floor_f64(x);
+}
+__device__ static inline unsigned int __double2uint_rn(double x) {
+    return (unsigned int)__ocml_rint_f64(x);
+}
+__device__ static inline unsigned int __double2uint_ru(double x) {
+    return (unsigned int)__ocml_ceil_f64(x);
+}
+__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
+
+__device__ static inline unsigned long long int __double2ull_rd(double x) {
+    return (unsigned long long int)__ocml_floor_f64(x);
+}
+__device__ static inline unsigned long long int __double2ull_rn(double x) {
+    return (unsigned long long int)__ocml_rint_f64(x);
+}
+__device__ static inline unsigned long long int __double2ull_ru(double x) {
+    return (unsigned long long int)__ocml_ceil_f64(x);
+}
+__device__ static inline unsigned long long int __double2ull_rz(double x) {
+    return (unsigned long long int)x;
+}
+
+__device__ static inline long long int __double_as_longlong(double x) {
+    static_assert(sizeof(long long) == sizeof(double), "");
+
+    long long tmp;
+    __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+    return tmp;
+}
+
+/*
+__device__ unsigned short __float2half_rn(float x);
+__device__ float __half2float(unsigned short);
+
+The above device functions are not valid.
+Use
+__device__ __half __float2half_rn(float x);
+__device__ float __half2float(__half);
+from hip_fp16.h instead.
+
+CUDA implements half as unsigned short, whereas HIP does not.
+*/
+
+__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
+__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
+__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
+__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
+
+__device__ static inline long long int __float2ll_rd(float x) {
+    return (long long int)__ocml_floor_f32(x);
+}
+__device__ static inline long long int __float2ll_rn(float x) {
+    return (long long int)__ocml_rint_f32(x);
+}
+__device__ static inline long long int __float2ll_ru(float x) {
+    return (long long int)__ocml_ceil_f32(x);
+}
+__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
+
+__device__ static inline unsigned int __float2uint_rd(float x) {
+    return (unsigned int)__ocml_floor_f32(x);
+}
+__device__ static inline unsigned int __float2uint_rn(float x) {
+    return (unsigned int)__ocml_rint_f32(x);
+}
+__device__ static inline unsigned int __float2uint_ru(float x) {
+    return (unsigned int)__ocml_ceil_f32(x);
+}
+__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
+
+__device__ static inline unsigned long long int __float2ull_rd(float x) {
+    return (unsigned long long int)__ocml_floor_f32(x);
+}
+__device__ static inline unsigned long long int __float2ull_rn(float x) {
+    return (unsigned long long int)__ocml_rint_f32(x);
+}
+__device__ static inline unsigned long long int __float2ull_ru(float x) {
+    return (unsigned long long int)__ocml_ceil_f32(x);
+}
+__device__ static inline unsigned long long int __float2ull_rz(float x) {
+    return (unsigned long long int)x;
+}
+
+__device__ static inline int __float_as_int(float x) {
+    static_assert(sizeof(int) == sizeof(float), "");
+
+    int tmp;
+    __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+    return tmp;
+}
+
+__device__ static inline unsigned int __float_as_uint(float x) {
+    static_assert(sizeof(unsigned int) == sizeof(float), "");
+
+    unsigned int tmp;
+    __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+    return tmp;
+}
+
+__device__ static inline double __hiloint2double(int hi, int lo) {
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
+    double tmp1;
+    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+
+    return tmp1;
+}
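+/*
+ * __double2hiint, __double2loint and __hiloint2double above are bit-pattern
+ * accessors rather than numeric conversions, so splitting and rebuilding is
+ * an identity. A minimal sketch; the helper name is illustrative.
+ *
+ *   __device__ double split_and_rebuild(double d) {
+ *       int hi = __double2hiint(d);      // upper 32 bits (sign, exponent)
+ *       int lo = __double2loint(d);      // lower 32 bits of the mantissa
+ *       return __hiloint2double(hi, lo); // bitwise-identical to d
+ *   }
+ */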
+__device__ static inline double __int2double_rn(int x) { return (double)x; }
+
+__device__ static inline float __int2float_rd(int x) { return (float)x; }
+__device__ static inline float __int2float_rn(int x) { return (float)x; }
+__device__ static inline float __int2float_ru(int x) { return (float)x; }
+__device__ static inline float __int2float_rz(int x) { return (float)x; }
+
+__device__ static inline float __int_as_float(int x) {
+    static_assert(sizeof(float) == sizeof(int), "");
+
+    float tmp;
+    __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+    return tmp;
+}
+
+__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
+__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
+__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
+__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }
+
+__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
+__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
+__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
+__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }
+
+__device__ static inline double __longlong_as_double(long long int x) {
+    static_assert(sizeof(double) == sizeof(long long), "");
+
+    double tmp;
+    __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+    return tmp;
+}
+
+__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
+
+__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
+__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
+__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
+__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }
+
+__device__ static inline float __uint_as_float(unsigned int x) {
+    static_assert(sizeof(float) == sizeof(unsigned int), "");
+
+    float tmp;
+    __builtin_memcpy(&tmp, &x, sizeof(tmp));
+
+    return tmp;
+}
+
+__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
+__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
+__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
+__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
+
+__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
+__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
+__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
+__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
+
+#if __HIP_CLANG_ONLY__
+
+// Clock functions
+__device__ long long int __clock64();
+__device__ long long int __clock();
+__device__ long long int clock64();
+__device__ long long int clock();
+// hip.amdgcn.bc - named sync
+__device__ void __named_sync(int a, int b);
+
+#ifdef __HIP_DEVICE_COMPILE__
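+/*
+ * A minimal device-side timing sketch built on the cycle counters declared
+ * above and defined below. Converting cycles to seconds needs the (variable)
+ * core clock rate and is left out; the function name and timed region are
+ * illustrative assumptions.
+ *
+ *   __device__ long long cycles_spent(void) {
+ *       long long t0 = clock64();
+ *       // ... code to be measured ...
+ *       return clock64() - t0; // elapsed core cycles, frequency may vary
+ *   }
+ */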
+
+// Clock function to return GPU core cycle count.
+// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
+// through the hipDeviceAttributeClockRate attribute.
+__device__
+inline __attribute__((always_inline))
+long long int __clock64() {
+#if __has_builtin(__builtin_amdgcn_s_memtime)
+  // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
+  return (long long int) __builtin_amdgcn_s_memtime();
+#else
+  // Subject to change when better solution available
+  return (long long int) __builtin_readcyclecounter();
+#endif
+}
+
+__device__
+inline __attribute__((always_inline))
+long long int __clock() { return __clock64(); }
+
+// Clock function to return wall clock count at a constant frequency. An interface to
+// query that frequency is planned but not yet implemented.
+__device__
+inline __attribute__((always_inline))
+long long int wall_clock64() {
+#if __has_builtin(__builtin_amdgcn_s_memrealtime)
+  // Exists since gfx8
+  return (long long int) __builtin_amdgcn_s_memrealtime();
+#else
+  return -1; // Negative return means __builtin_amdgcn_s_memrealtime unavailable.
+#endif
+}
+
+__device__
+inline __attribute__((always_inline))
+long long int clock64() { return __clock64(); }
+
+__device__
+inline __attribute__((always_inline))
+long long int clock() { return __clock(); }
+
+// hip.amdgcn.bc - named sync
+__device__
+inline
+void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
+
+#endif // __HIP_DEVICE_COMPILE__
+
+// warp vote function __all __any __ballot
+__device__
+inline
+int __all(int predicate) {
+    return __ockl_wfall_i32(predicate);
+}
+
+__device__
+inline
+int __any(int predicate) {
+    return __ockl_wfany_i32(predicate);
+}
+
+// XXX from llvm/include/llvm/IR/InstrTypes.h
+#define ICMP_NE 33
+
+__device__
+inline
+unsigned long long int __ballot(int predicate) {
+    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
+}
+
+__device__
+inline
+unsigned long long int __ballot64(int predicate) {
+    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
+}
+
+// hip.amdgcn.bc - lanemask
+__device__
+inline
+uint64_t __lanemask_gt()
+{
+    uint32_t lane = __ockl_lane_u32();
+    if (lane == 63)
+        return 0;
+    uint64_t ballot = __ballot64(1);
+    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
+    return mask & ballot;
+}
+
+__device__
+inline
+uint64_t __lanemask_lt()
+{
+    uint32_t lane = __ockl_lane_u32();
+    int64_t ballot = __ballot64(1);
+    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
+    return mask & ballot;
+}
+
+__device__
+inline
+uint64_t __lanemask_eq()
+{
+    uint32_t lane = __ockl_lane_u32();
+    int64_t mask = ((uint64_t)1 << lane);
+    return mask;
+}
+
+
+__device__ inline void* __local_to_generic(void* p) { return p; }
+
+#ifdef __HIP_DEVICE_COMPILE__
+__device__
+inline
+void* __get_dynamicgroupbaseptr()
+{
+    // Get group segment base pointer.
+ return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize())); +} +#else +__device__ +void* __get_dynamicgroupbaseptr(); +#endif // __HIP_DEVICE_COMPILE__ + +__device__ +inline +void *__amdgcn_get_dynamicgroupbaseptr() { + return __get_dynamicgroupbaseptr(); +} + +// Memory Fence Functions +__device__ +inline +static void __threadfence() +{ + __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device); +} + +__device__ +inline +static void __threadfence_block() +{ + __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group); +} + +__device__ +inline +static void __threadfence_system() +{ + __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices); +} + +// abort +__device__ +inline +__attribute__((weak)) +void abort() { + return __builtin_trap(); +} + +// The noinline attribute helps encapsulate the printf expansion, +// which otherwise has a performance impact just by increasing the +// size of the calling function. Additionally, the weak attribute +// allows the function to exist as a global although its definition is +// included in every compilation unit. +#if defined(_WIN32) || defined(_WIN64) +extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) +void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) { + // FIXME: Need `wchar_t` support to generate assertion message. + __builtin_trap(); +} +#else /* defined(_WIN32) || defined(_WIN64) */ +extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) +void __assert_fail(const char *assertion, + const char *file, + unsigned int line, + const char *function) +{ + printf("%s:%u: %s: Device-side assertion `%s' failed.\n", file, line, + function, assertion); + __builtin_trap(); +} + +extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) +void __assertfail(const char *assertion, + const char *file, + unsigned int line, + const char *function, + size_t charsize) +{ + // ignore all the args for now. + __builtin_trap(); +} +#endif /* defined(_WIN32) || defined(_WIN64) */ + +__device__ +inline +static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope) +{ + if (flags) { + __atomic_work_item_fence(flags, __memory_order_release, scope); + __builtin_amdgcn_s_barrier(); + __atomic_work_item_fence(flags, __memory_order_acquire, scope); + } else { + __builtin_amdgcn_s_barrier(); + } +} + +__device__ +inline +static void __barrier(int n) +{ + __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group); +} + +__device__ +inline +__attribute__((convergent)) +void __syncthreads() +{ + __barrier(__CLK_LOCAL_MEM_FENCE); +} + +__device__ +inline +__attribute__((convergent)) +int __syncthreads_count(int predicate) +{ + return __ockl_wgred_add_i32(!!predicate); +} + +__device__ +inline +__attribute__((convergent)) +int __syncthreads_and(int predicate) +{ + return __ockl_wgred_and_i32(!!predicate); +} + +__device__ +inline +__attribute__((convergent)) +int __syncthreads_or(int predicate) +{ + return __ockl_wgred_or_i32(!!predicate); +} + +// hip.amdgcn.bc - device routine +/* + HW_ID Register bit structure + WAVE_ID 3:0 Wave buffer slot number. 0-9. + SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. + PIPE_ID 7:6 Pipeline from which the wave was dispatched. + CU_ID 11:8 Compute Unit the wave is assigned to. + SH_ID 12 Shader Array (within an SE) the wave is assigned to. + SE_ID 14:13 Shader Engine the wave is assigned to. 
+   TG_ID      19:16 Thread-group ID
+   VM_ID      23:20 Virtual Memory ID
+   QUEUE_ID   26:24 Queue from which this wave was dispatched.
+   STATE_ID   29:27 State ID (graphics only, not compute).
+   ME_ID      31:30 Micro-engine ID.
+ */
+
+#define HW_ID               4
+
+#define HW_ID_CU_ID_SIZE    4
+#define HW_ID_CU_ID_OFFSET  8
+
+#define HW_ID_SE_ID_SIZE    2
+#define HW_ID_SE_ID_OFFSET  13
+
+/*
+   Encoding of parameter bitmask
+   HW_ID        5:0     HW_ID
+   OFFSET       10:6    Range: 0..31
+   SIZE         15:11   Range: 1..32
+ */
+
+#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
+
+/*
+  __smid returns the wave's assigned Compute Unit and Shader Engine.
+  The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
+  Note: the results vary over time.
+  SZ minus 1 since SIZE is 1-based.
+*/
+__device__
+inline
+unsigned __smid(void)
+{
+    unsigned cu_id = __builtin_amdgcn_s_getreg(
+        GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
+    unsigned se_id = __builtin_amdgcn_s_getreg(
+        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
+
+    /* Each shader engine has 16 CU */
+    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
+}
+
+/**
+ * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
+ * To be removed in a future release.
+ */
+#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+
+#endif //defined(__clang__) && defined(__HIP__)
+
+
+// loop unrolling
+static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
+    auto dstPtr = static_cast<unsigned char*>(dst);
+    auto srcPtr = static_cast<const unsigned char*>(src);
+
+    while (size >= 4u) {
+        dstPtr[0] = srcPtr[0];
+        dstPtr[1] = srcPtr[1];
+        dstPtr[2] = srcPtr[2];
+        dstPtr[3] = srcPtr[3];
+
+        size -= 4u;
+        srcPtr += 4u;
+        dstPtr += 4u;
+    }
+    switch (size) {
+        case 3:
+            dstPtr[2] = srcPtr[2];
+        case 2:
+            dstPtr[1] = srcPtr[1];
+        case 1:
+            dstPtr[0] = srcPtr[0];
+    }
+
+    return dst;
+}
+
+static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
+    auto dstPtr = static_cast<unsigned char*>(dst);
+
+    while (size >= 4u) {
+        dstPtr[0] = val;
+        dstPtr[1] = val;
+        dstPtr[2] = val;
+        dstPtr[3] = val;
+
+        size -= 4u;
+        dstPtr += 4u;
+    }
+    switch (size) {
+        case 3:
+            dstPtr[2] = val;
+        case 2:
+            dstPtr[1] = val;
+        case 1:
+            dstPtr[0] = val;
+    }
+
+    return dst;
+}
+#ifndef __OPENMP_AMDGCN__
+static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
+    return __hip_hc_memcpy(dst, src, size);
+}
+
+static inline __device__ void* memset(void* ptr, int val, size_t size) {
+    unsigned char val8 = static_cast<unsigned char>(val);
+    return __hip_hc_memset(ptr, val8, size);
+}
+#endif // !__OPENMP_AMDGCN__
+#endif
diff --git a/include/hip/amd_detail/device_library_decls.h b/include/hip/amd_detail/device_library_decls.h
new file mode 100644
index 0000000000..bfb922ff8c
--- /dev/null
+++ b/include/hip/amd_detail/device_library_decls.h
@@ -0,0 +1,118 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file amd_detail/device_library_decls.h + * @brief Contains declarations for types and functions in device library. + * Uses int64_t and uint64_t instead of long, long long, unsigned + * long and unsigned long long types for device library API + * declarations. + */ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H + +#include "hip/amd_detail/host_defines.h" + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef unsigned long ulong; +typedef unsigned long long ullong; + +extern "C" __device__ __attribute__((const)) bool __ockl_wfany_i32(int); +extern "C" __device__ __attribute__((const)) bool __ockl_wfall_i32(int); +extern "C" __device__ uint __ockl_activelane_u32(void); + +extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint); +extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int); +extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint); +extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int); +extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint); + +extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar); +extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort); +extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint); +extern "C" __device__ __attribute__((const)) uint64_t __ockl_clz_u64(uint64_t); + +extern "C" __device__ __attribute__((const)) float __ocml_floor_f32(float); +extern "C" __device__ __attribute__((const)) float __ocml_rint_f32(float); +extern "C" __device__ __attribute__((const)) float __ocml_ceil_f32(float); +extern "C" __device__ __attribute__((const)) float __ocml_trunc_f32(float); + +extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float); +extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float); + +extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid); +extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid); + +extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32(); +extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void); +extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void); +extern "C" __device__ __attribute__((const)) uint 
__ockl_multi_grid_num_grids(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
+extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
+extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
+
+extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
+
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a);
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
+
+
+// Introduce local address space
+#define __local __attribute__((address_space(3)))
+
+#ifdef __HIP_DEVICE_COMPILE__
+__device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; }
+#endif //__HIP_DEVICE_COMPILE__
+
+// Using hip.amdgcn.bc - sync threads
+#define __CLK_LOCAL_MEM_FENCE    0x01
+typedef unsigned __cl_mem_fence_flags;
+
+typedef enum __memory_scope {
+    __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+    __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+    __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+    __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+    __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+} __memory_scope;
+
+// enum values aligned with what clang uses in EmitAtomicExpr()
+typedef enum __memory_order
+{
+    __memory_order_relaxed = __ATOMIC_RELAXED,
+    __memory_order_acquire = __ATOMIC_ACQUIRE,
+    __memory_order_release = __ATOMIC_RELEASE,
+    __memory_order_acq_rel = __ATOMIC_ACQ_REL,
+    __memory_order_seq_cst = __ATOMIC_SEQ_CST
+} __memory_order;
+
+// Linked from hip.amdgcn.bc
+extern "C" __device__ void
+__atomic_work_item_fence(__cl_mem_fence_flags, __memory_order, __memory_scope);
+
+#endif
diff --git a/include/hip/amd_detail/driver_types.h b/include/hip/amd_detail/driver_types.h
new file mode 100644
index 0000000000..fe29d1f144
--- /dev/null
+++ b/include/hip/amd_detail/driver_types.h
@@ -0,0 +1,478 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DRIVER_TYPES_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_DRIVER_TYPES_H
+
+// The following macro should be removed once upstream is updated.
+// It's defined here as a workaround for a rocThrust build failure.
+#define HIP_INCLUDE_HIP_HCC_DETAIL_DRIVER_TYPES_H
+
+#if !defined(__HIPCC_RTC__)
+#ifndef __cplusplus
+#include <stdbool.h>
+#endif
+#endif // !defined(__HIPCC_RTC__)
+
+typedef void* hipDeviceptr_t;
+typedef enum hipChannelFormatKind {
+    hipChannelFormatKindSigned = 0,
+    hipChannelFormatKindUnsigned = 1,
+    hipChannelFormatKindFloat = 2,
+    hipChannelFormatKindNone = 3
+}hipChannelFormatKind;
+
+typedef struct hipChannelFormatDesc {
+    int x;
+    int y;
+    int z;
+    int w;
+    enum hipChannelFormatKind f;
+}hipChannelFormatDesc;
+
+#define HIP_TRSA_OVERRIDE_FORMAT 0x01
+#define HIP_TRSF_READ_AS_INTEGER 0x01
+#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
+#define HIP_TRSF_SRGB 0x10
+
+typedef enum hipArray_Format {
+    HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+    HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+    HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+    HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
+    HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
+    HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
+    HIP_AD_FORMAT_HALF = 0x10,
+    HIP_AD_FORMAT_FLOAT = 0x20
+}hipArray_Format;
+
+typedef struct HIP_ARRAY_DESCRIPTOR {
+    size_t Width;
+    size_t Height;
+    enum hipArray_Format Format;
+    unsigned int NumChannels;
+}HIP_ARRAY_DESCRIPTOR;
+
+typedef struct HIP_ARRAY3D_DESCRIPTOR {
+    size_t Width;
+    size_t Height;
+    size_t Depth;
+    enum hipArray_Format Format;
+    unsigned int NumChannels;
+    unsigned int Flags;
+}HIP_ARRAY3D_DESCRIPTOR;
+
+typedef struct hipArray {
+    void* data;  // FIXME: generalize this
+    struct hipChannelFormatDesc desc;
+    unsigned int type;
+    unsigned int width;
+    unsigned int height;
+    unsigned int depth;
+    enum hipArray_Format Format;
+    unsigned int NumChannels;
+    bool isDrv;
+    unsigned int textureType;
+}hipArray;
+
+#if !defined(__HIPCC_RTC__)
+typedef struct hip_Memcpy2D {
+    size_t srcXInBytes;
+    size_t srcY;
+    hipMemoryType srcMemoryType;
+    const void* srcHost;
+    hipDeviceptr_t srcDevice;
+    hipArray* srcArray;
+    size_t srcPitch;
+    size_t dstXInBytes;
+    size_t dstY;
+    hipMemoryType dstMemoryType;
+    void* dstHost;
+    hipDeviceptr_t dstDevice;
+    hipArray* dstArray;
+    size_t dstPitch;
+    size_t WidthInBytes;
+    size_t Height;
+} hip_Memcpy2D;
+#endif // !defined(__HIPCC_RTC__)
+
+typedef struct hipArray* hipArray_t;
+typedef hipArray_t hiparray;
+typedef const struct hipArray* hipArray_const_t;
+
+typedef struct hipMipmappedArray {
+    void* data;
+    struct hipChannelFormatDesc desc;
+    unsigned int type;
+    unsigned int width;
+    unsigned int height;
+    unsigned int depth;
+    unsigned int min_mipmap_level;
+    unsigned int max_mipmap_level;
+    unsigned int flags;
+    enum hipArray_Format format;
+} hipMipmappedArray;
+
+typedef struct hipMipmappedArray* hipMipmappedArray_t;
+
+typedef const struct hipMipmappedArray* hipMipmappedArray_const_t;
+
+/**
+ * hip resource types
+ */
+typedef enum hipResourceType {
+    hipResourceTypeArray = 0x00,
+    hipResourceTypeMipmappedArray = 0x01,
+    hipResourceTypeLinear = 0x02,
+    hipResourceTypePitch2D = 0x03
+}hipResourceType;
+
+typedef enum HIPresourcetype_enum {
+    HIP_RESOURCE_TYPE_ARRAY           = 0x00, /**< Array resource */
+    HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
+    HIP_RESOURCE_TYPE_LINEAR          = 0x02, /**< Linear resource */
+    HIP_RESOURCE_TYPE_PITCH2D         = 0x03  /**< Pitch 2D resource */
+} HIPresourcetype;
+
+/**
+ * hip address modes
+ */
+typedef enum HIPaddress_mode_enum {
+    HIP_TR_ADDRESS_MODE_WRAP   = 0,
+    HIP_TR_ADDRESS_MODE_CLAMP  = 1,
+    HIP_TR_ADDRESS_MODE_MIRROR = 2,
+    HIP_TR_ADDRESS_MODE_BORDER = 3
+} HIPaddress_mode;
+
+/**
+ * hip filter modes
+ */ +typedef enum HIPfilter_mode_enum { + HIP_TR_FILTER_MODE_POINT = 0, + HIP_TR_FILTER_MODE_LINEAR = 1 +} HIPfilter_mode; + +/** + * Texture descriptor + */ +typedef struct HIP_TEXTURE_DESC_st { + HIPaddress_mode addressMode[3]; /**< Address modes */ + HIPfilter_mode filterMode; /**< Filter mode */ + unsigned int flags; /**< Flags */ + unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ + HIPfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ + float mipmapLevelBias; /**< Mipmap level bias */ + float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ + float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ + float borderColor[4]; /**< Border Color */ + int reserved[12]; +} HIP_TEXTURE_DESC; + +/** + * hip texture resource view formats + */ +typedef enum hipResourceViewFormat { + hipResViewFormatNone = 0x00, + hipResViewFormatUnsignedChar1 = 0x01, + hipResViewFormatUnsignedChar2 = 0x02, + hipResViewFormatUnsignedChar4 = 0x03, + hipResViewFormatSignedChar1 = 0x04, + hipResViewFormatSignedChar2 = 0x05, + hipResViewFormatSignedChar4 = 0x06, + hipResViewFormatUnsignedShort1 = 0x07, + hipResViewFormatUnsignedShort2 = 0x08, + hipResViewFormatUnsignedShort4 = 0x09, + hipResViewFormatSignedShort1 = 0x0a, + hipResViewFormatSignedShort2 = 0x0b, + hipResViewFormatSignedShort4 = 0x0c, + hipResViewFormatUnsignedInt1 = 0x0d, + hipResViewFormatUnsignedInt2 = 0x0e, + hipResViewFormatUnsignedInt4 = 0x0f, + hipResViewFormatSignedInt1 = 0x10, + hipResViewFormatSignedInt2 = 0x11, + hipResViewFormatSignedInt4 = 0x12, + hipResViewFormatHalf1 = 0x13, + hipResViewFormatHalf2 = 0x14, + hipResViewFormatHalf4 = 0x15, + hipResViewFormatFloat1 = 0x16, + hipResViewFormatFloat2 = 0x17, + hipResViewFormatFloat4 = 0x18, + hipResViewFormatUnsignedBlockCompressed1 = 0x19, + hipResViewFormatUnsignedBlockCompressed2 = 0x1a, + hipResViewFormatUnsignedBlockCompressed3 = 0x1b, + hipResViewFormatUnsignedBlockCompressed4 = 0x1c, + hipResViewFormatSignedBlockCompressed4 = 0x1d, + hipResViewFormatUnsignedBlockCompressed5 = 0x1e, + hipResViewFormatSignedBlockCompressed5 = 0x1f, + hipResViewFormatUnsignedBlockCompressed6H = 0x20, + hipResViewFormatSignedBlockCompressed6H = 0x21, + hipResViewFormatUnsignedBlockCompressed7 = 0x22 +}hipResourceViewFormat; + +typedef enum HIPresourceViewFormat_enum +{ + HIP_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ + HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers 
*/ + HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ +} HIPresourceViewFormat; + +/** + * HIP resource descriptor + */ +typedef struct hipResourceDesc { + enum hipResourceType resType; + + union { + struct { + hipArray_t array; + } array; + struct { + hipMipmappedArray_t mipmap; + } mipmap; + struct { + void* devPtr; + struct hipChannelFormatDesc desc; + size_t sizeInBytes; + } linear; + struct { + void* devPtr; + struct hipChannelFormatDesc desc; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + } res; +}hipResourceDesc; + +typedef struct HIP_RESOURCE_DESC_st +{ + HIPresourcetype resType; /**< Resource type */ + + union { + struct { + hipArray_t hArray; /**< HIP array */ + } array; + struct { + hipMipmappedArray_t hMipmappedArray; /**< HIP mipmapped array */ + } mipmap; + struct { + hipDeviceptr_t devPtr; /**< Device pointer */ + hipArray_Format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct { + hipDeviceptr_t devPtr; /**< Device pointer */ + hipArray_Format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + struct { + int reserved[32]; + } reserved; + } res; + + unsigned int flags; /**< Flags (must be zero) */ +} HIP_RESOURCE_DESC; + +/** + * hip resource view descriptor + */ +struct hipResourceViewDesc { + enum hipResourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; +}; + +/** + * Resource view descriptor + */ +typedef struct HIP_RESOURCE_VIEW_DESC_st +{ + HIPresourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of 
the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ + unsigned int reserved[16]; +} HIP_RESOURCE_VIEW_DESC; + +/** + * Memory copy types + * + */ +#if !defined(__HIPCC_RTC__) +typedef enum hipMemcpyKind { + hipMemcpyHostToHost = 0, ///< Host-to-Host Copy + hipMemcpyHostToDevice = 1, ///< Host-to-Device Copy + hipMemcpyDeviceToHost = 2, ///< Device-to-Host Copy + hipMemcpyDeviceToDevice = 3, ///< Device-to-Device Copy + hipMemcpyDefault = + 4 ///< Runtime will automatically determine copy-kind based on virtual addresses. +} hipMemcpyKind; + +typedef struct hipPitchedPtr { + void* ptr; + size_t pitch; + size_t xsize; + size_t ysize; +}hipPitchedPtr; + +typedef struct hipExtent { + size_t width; // Width in elements when referring to array memory, in bytes when referring to + // linear memory + size_t height; + size_t depth; +}hipExtent; + +typedef struct hipPos { + size_t x; + size_t y; + size_t z; +}hipPos; + +typedef struct hipMemcpy3DParms { + hipArray_t srcArray; + struct hipPos srcPos; + struct hipPitchedPtr srcPtr; + hipArray_t dstArray; + struct hipPos dstPos; + struct hipPitchedPtr dstPtr; + struct hipExtent extent; + enum hipMemcpyKind kind; +} hipMemcpy3DParms; + +typedef struct HIP_MEMCPY3D { + unsigned int srcXInBytes; + unsigned int srcY; + unsigned int srcZ; + unsigned int srcLOD; + hipMemoryType srcMemoryType; + const void* srcHost; + hipDeviceptr_t srcDevice; + hipArray_t srcArray; + unsigned int srcPitch; + unsigned int srcHeight; + unsigned int dstXInBytes; + unsigned int dstY; + unsigned int dstZ; + unsigned int dstLOD; + hipMemoryType dstMemoryType; + void* dstHost; + hipDeviceptr_t dstDevice; + hipArray_t dstArray; + unsigned int dstPitch; + unsigned int dstHeight; + unsigned int WidthInBytes; + unsigned int Height; + unsigned int Depth; +} HIP_MEMCPY3D; + +static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz, + size_t ysz) { + struct hipPitchedPtr s; + + s.ptr = d; + s.pitch = p; + s.xsize = xsz; + s.ysize = ysz; + + return s; +} + +static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) { + struct hipPos p; + + p.x = x; + p.y = y; + p.z = z; + + return p; +} + +static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) { + struct hipExtent e; + + e.width = w; + e.height = h; + e.depth = d; + + return e; +} + +typedef enum hipFunction_attribute { + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES, + HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + HIP_FUNC_ATTRIBUTE_NUM_REGS, + HIP_FUNC_ATTRIBUTE_PTX_VERSION, + HIP_FUNC_ATTRIBUTE_BINARY_VERSION, + HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA, + HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, + HIP_FUNC_ATTRIBUTE_MAX +}hipFunction_attribute; +#endif // !defined(__HIPCC_RTC__) +#endif diff --git a/include/hip/amd_detail/functional_grid_launch.hpp b/include/hip/amd_detail/functional_grid_launch.hpp new file mode 100644 index 0000000000..efe6a60197 --- /dev/null +++ b/include/hip/amd_detail/functional_grid_launch.hpp @@ -0,0 +1,218 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "concepts.hpp"
+#include "helpers.hpp"
+#include "program_state.hpp"
+#include "hip_runtime_api.h"
+
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
+                                               unsigned int flags, hip_impl::program_state& ps);
+
+hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
+                                      dim3 blockDim, void** args,
+                                      size_t sharedMem, hipStream_t stream,
+                                      hip_impl::program_state& ps);
+
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int numDevices,
+                                                 unsigned int flags,
+                                                 hip_impl::program_state& ps);
+
+#pragma GCC visibility push(hidden)
+
+namespace hip_impl {
+template <typename T, typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+inline T round_up_to_next_multiple_nonnegative(T x, T y) {
+    T tmp = x + y - 1;
+    return tmp - tmp % y;
+}
+
+template <
+    std::size_t n,
+    typename... Ts,
+    typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
+inline hip_impl::kernarg make_kernarg(
+    const std::tuple<Ts...>&,
+    const kernargs_size_align&,
+    hip_impl::kernarg kernarg) {
+    return kernarg;
+}
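+// Worked example (added for exposition; this check is hypothetical and not
+// part of the original header): make_kernarg, defined below, packs each kernel
+// argument at the next offset that satisfies its alignment. After a 4-byte int,
+// an 8-byte double lands at round_up_to_next_multiple_nonnegative(4, 8) == 8:
+static_assert((4 + 8 - 1) - (4 + 8 - 1) % 8 == 8,
+              "a double following an int is packed at offset 8");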
+template <
+    std::size_t n,
+    typename... Ts,
+    typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
+inline hip_impl::kernarg make_kernarg(
+    const std::tuple<Ts...>& formals,
+    const kernargs_size_align& size_align,
+    hip_impl::kernarg kernarg) {
+    using T = typename std::tuple_element<n, std::tuple<Ts...>>::type;
+
+    static_assert(
+        !std::is_reference<T>{},
+        "A __global__ function cannot have a reference as one of its "
+        "arguments.");
+    #if defined(HIP_STRICT)
+        static_assert(
+            std::is_trivially_copyable<T>{},
+            "Only TriviallyCopyable types can be arguments to a __global__ "
+            "function");
+    #endif
+
+    kernarg.resize(round_up_to_next_multiple_nonnegative(
+        kernarg.size(), size_align.alignment(n)) + size_align.size(n));
+
+    std::memcpy(
+        kernarg.data() + kernarg.size() - size_align.size(n),
+        &std::get<n>(formals),
+        size_align.size(n));
+    return make_kernarg<n + 1>(formals, size_align, std::move(kernarg));
+}
+
+template <typename... Formals, typename... Actuals>
+inline hip_impl::kernarg make_kernarg(
+    void (*kernel)(Formals...), std::tuple<Actuals...> actuals) {
+    static_assert(sizeof...(Formals) == sizeof...(Actuals),
+                  "The count of formal arguments must match the count of actuals.");
+
+    if (sizeof...(Formals) == 0) return {};
+
+    std::tuple<Formals...> to_formals{std::move(actuals)};
+    hip_impl::kernarg kernarg;
+    kernarg.reserve(sizeof(to_formals));
+
+    auto& ps = hip_impl::get_program_state();
+    return make_kernarg<0>(to_formals,
+                           ps.get_kernargs_size_align(
+                               reinterpret_cast<std::uintptr_t>(kernel)),
+                           std::move(kernarg));
+}
+
+
+HIP_INTERNAL_EXPORTED_API hsa_agent_t target_agent(hipStream_t stream);
+
+inline
+__attribute__((visibility("hidden")))
+void hipLaunchKernelGGLImpl(
+    std::uintptr_t function_address,
+    const dim3& numBlocks,
+    const dim3& dimBlocks,
+    std::uint32_t sharedMemBytes,
+    hipStream_t stream,
+    void** kernarg) {
+
+    const auto& kd = hip_impl::get_program_state().kernel_descriptor(function_address,
+                                                                     target_agent(stream));
+
+    hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
+                          dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
+                          stream, nullptr, kernarg);
+}
+} // Namespace hip_impl.
+
+
+template <typename T>
+inline
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+    T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) {
+
+    using namespace hip_impl;
+
+    hip_impl::hip_init();
+    auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
+                                                   target_agent(0));
+
+    return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
+                                                   dynSharedMemPerBlk, blockSizeLimit);
+}
+
+template <typename T>
+inline
+hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+    T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, unsigned int flags = 0 ) {
+
+    using namespace hip_impl;
+
+    hip_impl::hip_init();
+    if(flags != hipOccupancyDefault) return hipErrorNotSupported;
+    auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
+                                                   target_agent(0));
+
+    return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
+                                                   dynSharedMemPerBlk, blockSizeLimit);
+}
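+// Usage sketch (added for exposition; `vector_add`, `dst`, `src` and `n` are
+// hypothetical names, not part of this patch): query a launch configuration
+// with the occupancy API above, then launch through hipLaunchKernelGGL, which
+// is defined below.
+//
+//   int minGridSize = 0, blockSize = 0;
+//   hipOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, vector_add);
+//   hipLaunchKernelGGL(vector_add, dim3(minGridSize), dim3(blockSize),
+//                      0 /*sharedMem*/, 0 /*stream*/, dst, src, n);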
+template <typename... Args, typename F = void (*)(Args...)>
+inline
+void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+                        std::uint32_t sharedMemBytes, hipStream_t stream,
+                        Args... args) {
+    hip_impl::hip_init();
+    auto kernarg = hip_impl::make_kernarg(kernel, std::tuple<Args...>{std::move(args)...});
+    std::size_t kernarg_size = kernarg.size();
+
+    void* config[]{
+        HIP_LAUNCH_PARAM_BUFFER_POINTER,
+        kernarg.data(),
+        HIP_LAUNCH_PARAM_BUFFER_SIZE,
+        &kernarg_size,
+        HIP_LAUNCH_PARAM_END};
+
+    hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel),
+                                     numBlocks, dimBlocks, sharedMemBytes,
+                                     stream, &config[0]);
+}
+
+template <class F>
+inline
+__attribute__((visibility("hidden")))
+hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDim,
+                                      void** args, size_t sharedMem,
+                                      hipStream_t stream) {
+    hip_impl::hip_init();
+    auto& ps = hip_impl::get_program_state();
+    return hipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim,
+                                      blockDim, args, sharedMem, stream, ps);
+}
+
+inline
+__attribute__((visibility("hidden")))
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int numDevices,
+                                                 unsigned int flags) {
+
+    hip_impl::hip_init();
+    auto& ps = hip_impl::get_program_state();
+    return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
+}
+
+#pragma GCC visibility pop
diff --git a/include/hip/amd_detail/grid_launch.h b/include/hip/amd_detail/grid_launch.h
new file mode 100644
index 0000000000..22841a5657
--- /dev/null
+++ b/include/hip/amd_detail/grid_launch.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <stdint.h>
+
+#include
+
+#define GRID_LAUNCH_VERSION 20
+
+// Extern definitions
+namespace hc{
+class completion_future;
+class accelerator_view;
+}
+
+
+// 3 dim structure for groups and grids.
+typedef struct gl_dim3
+{
+  int x,y,z;
+  gl_dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
+} gl_dim3;
+
+typedef enum gl_barrier_bit {
+    barrier_bit_queue_default,
+    barrier_bit_none,
+    barrier_bit_wait,
+} gl_barrier_bit;
+
+
+// grid_launch_parm contains information used to launch the kernel.
+typedef struct grid_launch_parm
+{
+  //! Grid dimensions
+  gl_dim3      grid_dim;
+
+  //! Group dimensions
+  gl_dim3      group_dim;
+
+  //! Amount of dynamic group memory to use with the kernel launch.
+  //! This memory is in addition to the amount used statically in the kernel.
+  unsigned int  dynamic_group_mem_bytes;
+
+  //! Control setting of barrier bit on per-packet basis:
+  //! See gl_barrier_bit description.
+  //! Placeholder, is not used to control packet dispatch yet
+  enum gl_barrier_bit barrier_bit;
+
+  //! Value of packet fences to apply to launch.
+  //! These correspond to the value of bits 9:14 in the AQL packet,
+  //! see HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE and hsa_fence_scope_t.
+  unsigned int launch_fence;
+
+  //! Pointer to the accelerator_view where the kernel should execute.
+  //! If NULL, the default view on the default accelerator is used.
+  hc::accelerator_view *av;
+
+  //! Pointer to the completion_future used to track the status of the command.
+  //! If NULL, the command does not write status. In this case,
+  //! synchronization can be enforced with queue-level waits or
+  //! waiting on younger commands.
+ hc::completion_future *cf; + + grid_launch_parm() = default; +} grid_launch_parm; + + +extern void init_grid_launch(grid_launch_parm *gl); diff --git a/include/hip/amd_detail/grid_launch.hpp b/include/hip/amd_detail/grid_launch.hpp new file mode 100644 index 0000000000..04ce7e0366 --- /dev/null +++ b/include/hip/amd_detail/grid_launch.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "grid_launch.h" +#include "hc.hpp" + +class grid_launch_parm_cxx : public grid_launch_parm +{ +public: + grid_launch_parm_cxx() = default; + + // customized serialization: don't need av and cf in kernel + __attribute__((annotate("serialize"))) + void __cxxamp_serialize(Kalmar::Serialize& s) const { + s.Append(sizeof(int), &grid_dim.x); + s.Append(sizeof(int), &grid_dim.y); + s.Append(sizeof(int), &grid_dim.z); + s.Append(sizeof(int), &group_dim.x); + s.Append(sizeof(int), &group_dim.y); + s.Append(sizeof(int), &group_dim.z); + } + + __attribute__((annotate("user_deserialize"))) + grid_launch_parm_cxx(int grid_dim_x, int grid_dim_y, int grid_dim_z, + int group_dim_x, int group_dim_y, int group_dim_z) { + grid_dim.x = grid_dim_x; + grid_dim.y = grid_dim_y; + grid_dim.z = grid_dim_z; + group_dim.x = group_dim_x; + group_dim.y = group_dim_y; + group_dim.z = group_dim_z; + } +}; + + +extern inline void grid_launch_init(grid_launch_parm *lp) { + lp->grid_dim.x = lp->grid_dim.y = lp->grid_dim.z = 1; + + lp->group_dim.x = lp->group_dim.y = lp->group_dim.z = 1; + + lp->dynamic_group_mem_bytes = 0; + + lp->barrier_bit = barrier_bit_queue_default; + lp->launch_fence = -1; + + // TODO - set to NULL? + static hc::accelerator_view av = hc::accelerator().get_default_view(); + lp->av = &av; + lp->cf = NULL; +} + diff --git a/tests/src/runtimeApi/module/managed_kernel.cpp b/include/hip/amd_detail/grid_launch_GGL.hpp similarity index 84% rename from tests/src/runtimeApi/module/managed_kernel.cpp rename to include/hip/amd_detail/grid_launch_GGL.hpp index 7c37713c9d..fbae198af1 100644 --- a/tests/src/runtimeApi/module/managed_kernel.cpp +++ b/include/hip/amd_detail/grid_launch_GGL.hpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2021-present Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -19,9 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "hip/hip_runtime.h" -__managed__ int x = 10; +#pragma once -extern "C" __global__ void GPU_func() { - x++; -} +#if GENERIC_GRID_LAUNCH == 1 +#include "macro_based_grid_launch.hpp" +#endif // GENERIC_GRID_LAUNCH \ No newline at end of file diff --git a/include/hip/amd_detail/helpers.hpp b/include/hip/amd_detail/helpers.hpp new file mode 100644 index 0000000000..b94b126994 --- /dev/null +++ b/include/hip/amd_detail/helpers.hpp @@ -0,0 +1,137 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "concepts.hpp"
+
+#include <type_traits> // For std::conditional, std::decay, std::enable_if,
+                       // std::false_type, std::result_of and std::true_type.
+#include <utility>     // For std::declval.
+
+#ifdef __has_include // Check if __has_include is present
+#  if __has_include(<version>) // Check for version header
+#    include <version>
+#    if defined(__cpp_lib_is_invocable) && !defined(HIP_HAS_INVOCABLE)
+#      define HIP_HAS_INVOCABLE __cpp_lib_is_invocable
+#    endif
+#    if defined(__cpp_lib_result_of_sfinae) && !defined(HIP_HAS_RESULT_OF_SFINAE)
+#      define HIP_HAS_RESULT_OF_SFINAE __cpp_lib_result_of_sfinae
+#    endif
+#  endif
+#endif
+
+#ifndef HIP_HAS_INVOCABLE
+#define HIP_HAS_INVOCABLE 0
+#endif
+
+#ifndef HIP_HAS_RESULT_OF_SFINAE
+#define HIP_HAS_RESULT_OF_SFINAE 0
+#endif
+
+namespace std { // TODO: these should be removed as soon as possible.
+#if (__cplusplus < 201406L)
+#if (__cplusplus < 201402L)
+template <bool B, typename T = void>
+using enable_if_t = typename enable_if<B, T>::type;
+template <bool B, typename T, typename F>
+using conditional_t = typename conditional<B, T, F>::type;
+template <typename T>
+using decay_t = typename decay<T>::type;
+template <typename F>
+using result_of_t = typename result_of<F>::type;
+template <typename T>
+using remove_reference_t = typename remove_reference<T>::type;
+#endif
+#endif
+} // namespace std
+
+namespace hip_impl {
+template <typename...>
+using void_t_ = void;
+
+#if HIP_HAS_INVOCABLE
+template <typename, typename = void>
+struct is_callable_impl;
+
+template <typename F, typename... Ts>
+struct is_callable_impl<F(Ts...)> : std::is_invocable<F, Ts...> {};
+#elif HIP_HAS_RESULT_OF_SFINAE
+template <typename, typename = void>
+struct is_callable_impl : std::false_type {};
+
+template <typename F, typename... Ts>
+struct is_callable_impl<F(Ts...),
+                        void_t_<typename std::result_of<F(Ts...)>::type> > : std::true_type {};
+#else
+template <typename T, typename Base, typename Derived>
+auto simple_invoke(T Base::*pmd, Derived&& ref)
+-> decltype(static_cast<Derived&&>(ref).*pmd);
+
+template <typename PMD, typename Pointer>
+auto simple_invoke(PMD&& pmd, Pointer&& ptr)
+-> decltype((*static_cast<Pointer&&>(ptr)).*static_cast<PMD&&>(pmd));
+
+template <typename T, typename Base, typename U>
+auto simple_invoke(T Base::*pmd, const std::reference_wrapper<U>& ref)
+-> decltype(ref.get().*pmd);
+
+template <typename T, typename Base, typename Derived, typename... Args>
+auto simple_invoke(T Base::*pmf, Derived&& ref, Args&&... args)
+-> decltype((static_cast<Derived&&>(ref).*pmf)(static_cast<Args&&>(args)...));
+
+template <typename PMF, typename Pointer, typename... Args>
+auto simple_invoke(PMF&& pmf, Pointer&& ptr, Args&&... args)
+-> decltype(((*static_cast<Pointer&&>(ptr)).*static_cast<PMF&&>(pmf))(static_cast<Args&&>(args)...));
+
+template <typename T, typename Base, typename U, typename... Args>
+auto simple_invoke(T Base::*pmf, const std::reference_wrapper<U>& ref, Args&&... args)
+-> decltype((ref.get().*pmf)(static_cast<Args&&>(args)...));
+
+template <typename F, typename... Ts>
+auto simple_invoke(F&& f, Ts&&... xs)
+-> decltype(f(static_cast<Ts&&>(xs)...));
+
+template <typename, typename = void>
+struct is_callable_impl : std::false_type {};
+
+template <typename F, typename... Ts>
+struct is_callable_impl<F(Ts...),
+                        void_t_<decltype(simple_invoke(std::declval<F>(), std::declval<Ts>()...))> >
+    : std::true_type {};
+
+#endif
+
+template <typename Call>
+struct is_callable : is_callable_impl<Call> {};
+
+#define count_macro_args_impl_hip_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \
+                                   _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, \
+                                   _26, _27, _28, _29, _30, _31, _n, ...) \
+    _n
+#define count_macro_args_hip_(...) \
+    count_macro_args_impl_hip_(, ##__VA_ARGS__, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, \
+                               19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, \
+                               0)
+
+#define overloaded_macro_expand_hip_(macro, arg_cnt) macro##arg_cnt
+#define overload_macro_impl_hip_(macro, arg_cnt) overloaded_macro_expand_hip_(macro, arg_cnt)
+#define overload_macro_hip_(macro, ...) \
+    overload_macro_impl_hip_(macro, count_macro_args_hip_(__VA_ARGS__))(__VA_ARGS__)
+} // namespace hip_impl
diff --git a/include/hip/amd_detail/hip_atomic.h b/include/hip/amd_detail/hip_atomic.h
new file mode 100644
index 0000000000..0c4bc80cf6
--- /dev/null
+++ b/include/hip/amd_detail/hip_atomic.h
@@ -0,0 +1,691 @@
+
+
+#include "device_functions.h"
+
+#if __has_builtin(__hip_atomic_compare_exchange_strong)
+
+#if !__HIP_DEVICE_COMPILE__
+//TODO: Remove this after compiler pre-defines the following Macros.
+#define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
+#define __HIP_MEMORY_SCOPE_WAVEFRONT 2
+#define __HIP_MEMORY_SCOPE_WORKGROUP 3
+#define __HIP_MEMORY_SCOPE_AGENT 4
+#define __HIP_MEMORY_SCOPE_SYSTEM 5
+#endif
+
+__device__
+inline
+int atomicCAS(int* address, int compare, int val) {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                         __HIP_MEMORY_SCOPE_AGENT);
+    return compare;
+}
+
+__device__
+inline
+int atomicCAS_system(int* address, int compare, int val) {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                         __HIP_MEMORY_SCOPE_SYSTEM);
+    return compare;
+}
+
+__device__
+inline
+unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                         __HIP_MEMORY_SCOPE_AGENT);
+    return compare;
+}
+
+__device__
+inline
+unsigned int atomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                         __HIP_MEMORY_SCOPE_SYSTEM);
+    return compare;
+}
+
+__device__
+inline
+unsigned long long atomicCAS(unsigned long long* address, unsigned long long compare,
+                             unsigned long long val) {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                         __HIP_MEMORY_SCOPE_AGENT);
+    return compare;
+}
+
+__device__
+inline
+unsigned long long atomicCAS_system(unsigned long long* address, unsigned long long compare,
+                                    unsigned long long val) {
+    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                         __HIP_MEMORY_SCOPE_SYSTEM);
+    return compare;
+}
+
+__device__
+inline
+int atomicAdd(int* address, int val) {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__
+inline
+int atomicAdd_system(int* address, int val) {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+}
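+// Example (added for illustration; `example_atomicMax_double` is a
+// hypothetical helper, not part of this header): operations without a native
+// atomic can be emulated with the atomicCAS loop pattern, reusing the
+// unsigned long long atomicCAS defined above.
+__device__
+inline
+double example_atomicMax_double(double* address, double val) {
+    unsigned long long* p = reinterpret_cast<unsigned long long*>(address);
+    unsigned long long old = *p;
+    unsigned long long assumed;
+    do {
+        assumed = old;
+        if (__longlong_as_double(assumed) >= val) break;  // already the max
+        old = atomicCAS(p, assumed, __double_as_longlong(val));
+    } while (old != assumed);  // retry if another thread intervened
+    return __longlong_as_double(old);
+}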
+
+__device__
+inline
+unsigned int atomicAdd(unsigned int* address, unsigned int val) {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__
+inline
+unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+__device__
+inline
+unsigned long long atomicAdd(unsigned long long* address, unsigned long long val) {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__
+inline
+unsigned long long atomicAdd_system(unsigned long long* address, unsigned long long val) {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+__device__
+inline
+float atomicAdd(float* address, float val) {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__
+inline
+float atomicAdd_system(float* address, float val) {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+#if !defined(__HIPCC_RTC__)
+DEPRECATED("use atomicAdd instead")
+#endif // !defined(__HIPCC_RTC__)
+__device__
+inline
+void atomicAddNoRet(float* address, float val)
+{
+    __ockl_atomic_add_noret_f32(address, val);
+}
+
+__device__
+inline
+double atomicAdd(double* address, double val) {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__
+inline
+double atomicAdd_system(double* address, double val) {
+    return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+__device__
+inline
+int atomicSub(int* address, int val) {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__
+inline
+int atomicSub_system(int* address, int val) {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+__device__
+inline
+unsigned int atomicSub(unsigned int* address, unsigned int val) {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__
+inline
+unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
+    return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+__device__
+inline
+int atomicExch(int* address, int val) {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__
+inline
+int atomicExch_system(int* address, int val) {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+__device__
+inline
+unsigned int atomicExch(unsigned int* address, unsigned int val) {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__
+inline
+unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+__device__
+inline
+unsigned long long atomicExch(unsigned long long* address, unsigned long long val) {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+}
+
+__device__
+inline
+unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+}
+
+__device__
+inline
+float atomicExch(float* address, float val) {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +float atomicExch_system(float* address, float val) { + return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +int atomicMin(int* address, int val) { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +int atomicMin_system(int* address, int val) { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned int atomicMin(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned int atomicMin_system(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned long long atomicMin(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long long atomicMin_system(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +int atomicMax(int* address, int val) { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +int atomicMax_system(int* address, int val) { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned int atomicMax(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned int atomicMax_system(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned long long atomicMax(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long long atomicMax_system(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned int atomicInc(unsigned int* address, unsigned int val) +{ + __device__ + extern + unsigned int __builtin_amdgcn_atomic_inc( + unsigned int*, + unsigned int, + unsigned int, + unsigned int, + bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32"); + + return __builtin_amdgcn_atomic_inc( + address, val, __ATOMIC_RELAXED, 1 /* Device scope */, false); +} + +__device__ +inline +unsigned int atomicDec(unsigned int* address, unsigned int val) +{ + __device__ + extern + unsigned int __builtin_amdgcn_atomic_dec( + unsigned int*, + unsigned int, + unsigned int, + unsigned int, + bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32"); + + return __builtin_amdgcn_atomic_dec( + address, val, __ATOMIC_RELAXED, 1 /* Device scope */, false); +} + +__device__ +inline +int atomicAnd(int* address, int val) { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +int atomicAnd_system(int* address, int val) { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned int atomicAnd(unsigned int* address, 
unsigned int val) { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned int atomicAnd_system(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} +__device__ +inline +unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +int atomicOr(int* address, int val) { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +int atomicOr_system(int* address, int val) { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned int atomicOr(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned int atomicOr_system(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned long long atomicOr(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +int atomicXor(int* address, int val) { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +int atomicXor_system(int* address, int val) { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned int atomicXor(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned int atomicXor_system(unsigned int* address, unsigned int val) { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +__device__ +inline +unsigned long long atomicXor(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +} + +__device__ +inline +unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) { + return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +} + +#else + +__device__ +inline +int atomicCAS(int* address, int compare, int val) +{ + __atomic_compare_exchange_n( + address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + + return compare; +} +__device__ +inline +unsigned int atomicCAS( + unsigned int* address, unsigned int compare, unsigned int val) +{ + __atomic_compare_exchange_n( + address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + + return compare; +} +__device__ +inline +unsigned long long atomicCAS( + unsigned long long* address, + unsigned long long compare, + unsigned long long val) +{ + __atomic_compare_exchange_n( + address, &compare, val, 
+        false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+
+    return compare;
+}
+
+__device__
+inline
+int atomicAdd(int* address, int val)
+{
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned int atomicAdd(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned long long atomicAdd(
+    unsigned long long* address, unsigned long long val)
+{
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+float atomicAdd(float* address, float val)
+{
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+}
+
+#if !defined(__HIPCC_RTC__)
+DEPRECATED("use atomicAdd instead")
+#endif // !defined(__HIPCC_RTC__)
+__device__
+inline
+void atomicAddNoRet(float* address, float val)
+{
+    __ockl_atomic_add_noret_f32(address, val);
+}
+
+__device__
+inline
+double atomicAdd(double* address, double val)
+{
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+}
+
+__device__
+inline
+int atomicSub(int* address, int val)
+{
+    return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned int atomicSub(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
+}
+
+__device__
+inline
+int atomicExch(int* address, int val)
+{
+    return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned int atomicExch(unsigned int* address, unsigned int val)
+{
+    return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned long long atomicExch(unsigned long long* address, unsigned long long val)
+{
+    return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+float atomicExch(float* address, float val)
+{
+    return __uint_as_float(__atomic_exchange_n(
+        reinterpret_cast<unsigned int*>(address),
+        __float_as_uint(val),
+        __ATOMIC_RELAXED));
+}
+
+__device__
+inline
+int atomicMin(int* address, int val)
+{
+    return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned int atomicMin(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned long long atomicMin(
+    unsigned long long* address, unsigned long long val)
+{
+    unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
+    while (val < tmp) {
+        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
+
+        if (tmp1 != tmp) { tmp = tmp1; continue; }
+
+        tmp = atomicCAS(address, tmp, val);
+    }
+
+    return tmp;
+}
+
+__device__
+inline
+int atomicMax(int* address, int val)
+{
+    return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned int atomicMax(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
+}
+__device__
+inline
+unsigned long long atomicMax(
+    unsigned long long* address, unsigned long long val)
+{
+    unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
+    while (tmp < val) {
+        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
+
+        if (tmp1 != tmp) { tmp = tmp1; continue; }
+
+        tmp = atomicCAS(address, tmp, val);
+    }
+
+    return tmp;
+}
+
+__device__
+inline
+unsigned int atomicInc(unsigned int* address, unsigned int val)
+{
+    __device__
+    extern
+    unsigned int __builtin_amdgcn_atomic_inc(
+        unsigned int*,
+        unsigned int,
+        unsigned int,
+        unsigned int,
+        bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32");
+
+    return __builtin_amdgcn_atomic_inc(
+ address, val, __ATOMIC_RELAXED, 1 /* Device scope */, false); +} + +__device__ +inline +unsigned int atomicDec(unsigned int* address, unsigned int val) +{ + __device__ + extern + unsigned int __builtin_amdgcn_atomic_dec( + unsigned int*, + unsigned int, + unsigned int, + unsigned int, + bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32"); + + return __builtin_amdgcn_atomic_dec( + address, val, __ATOMIC_RELAXED, 1 /* Device scope */, false); +} + +__device__ +inline +int atomicAnd(int* address, int val) +{ + return __atomic_fetch_and(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicAnd(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_and(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned long long atomicAnd( + unsigned long long* address, unsigned long long val) +{ + return __atomic_fetch_and(address, val, __ATOMIC_RELAXED); +} + +__device__ +inline +int atomicOr(int* address, int val) +{ + return __atomic_fetch_or(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicOr(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_or(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned long long atomicOr( + unsigned long long* address, unsigned long long val) +{ + return __atomic_fetch_or(address, val, __ATOMIC_RELAXED); +} + +__device__ +inline +int atomicXor(int* address, int val) +{ + return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicXor(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned long long atomicXor( + unsigned long long* address, unsigned long long val) +{ + return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED); +} + +#endif diff --git a/include/hip/amd_detail/hip_common.h b/include/hip/amd_detail/hip_common.h new file mode 100644 index 0000000000..4881ade678 --- /dev/null +++ b/include/hip/amd_detail/hip_common.h @@ -0,0 +1,32 @@ +/* +Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H + +#if defined(__clang__) && defined(__HIP__) +#define __HIP_CLANG_ONLY__ 1 +#else +#define __HIP_CLANG_ONLY__ 0 +#endif + +#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H diff --git a/include/hip/amd_detail/hip_complex.h b/include/hip/amd_detail/hip_complex.h new file mode 100644 index 0000000000..db312780c1 --- /dev/null +++ b/include/hip/amd_detail/hip_complex.h @@ -0,0 +1,309 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H + +#include "hip/amd_detail/hip_vector_types.h" + +#if defined(__HIPCC_RTC__) +#define __HOST_DEVICE__ __device__ +#else +#define __HOST_DEVICE__ __host__ __device__ +// TODO: Clang has a bug which allows device functions to call std functions +// when std functions are introduced into default namespace by using statement. +// math.h may be included after this bug is fixed. 
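+// Usage sketch (added for exposition, not part of the original header): the
+// arithmetic helpers defined later in this file follow the CUDA complex API,
+// for example:
+//
+//   hipFloatComplex a = make_hipFloatComplex(1.0f, 2.0f);
+//   hipFloatComplex b = make_hipFloatComplex(3.0f, -1.0f);
+//   hipFloatComplex c = hipCmulf(a, b);   // (1+2i)(3-i) = 5 + 5i
+//   float m = hipCabsf(c);                // sqrt(50)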
+#if __cplusplus
+#include <cmath>
+#else
+#include "math.h"
+#endif
+#endif // !defined(__HIPCC_RTC__)
+
+#if __cplusplus
+#define COMPLEX_NEG_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type operator-(const type& op) { \
+        type ret; \
+        ret.x = -op.x; \
+        ret.y = -op.y; \
+        return ret; \
+    }
+
+#define COMPLEX_EQ_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline bool operator==(const type& lhs, const type& rhs) { \
+        return lhs.x == rhs.x && lhs.y == rhs.y; \
+    }
+
+#define COMPLEX_NE_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline bool operator!=(const type& lhs, const type& rhs) { \
+        return !(lhs == rhs); \
+    }
+
+#define COMPLEX_ADD_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type operator+(const type& lhs, const type& rhs) { \
+        type ret; \
+        ret.x = lhs.x + rhs.x; \
+        ret.y = lhs.y + rhs.y; \
+        return ret; \
+    }
+
+#define COMPLEX_SUB_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type operator-(const type& lhs, const type& rhs) { \
+        type ret; \
+        ret.x = lhs.x - rhs.x; \
+        ret.y = lhs.y - rhs.y; \
+        return ret; \
+    }
+
+#define COMPLEX_MUL_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type operator*(const type& lhs, const type& rhs) { \
+        type ret; \
+        ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \
+        ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \
+        return ret; \
+    }
+
+#define COMPLEX_DIV_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type operator/(const type& lhs, const type& rhs) { \
+        type ret; \
+        ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \
+        ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \
+        ret.x = ret.x / (rhs.x * rhs.x + rhs.y * rhs.y); \
+        ret.y = ret.y / (rhs.x * rhs.x + rhs.y * rhs.y); \
+        return ret; \
+    }
+
+#define COMPLEX_ADD_PREOP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type& operator+=(type& lhs, const type& rhs) { \
+        lhs.x += rhs.x; \
+        lhs.y += rhs.y; \
+        return lhs; \
+    }
+
+#define COMPLEX_SUB_PREOP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type& operator-=(type& lhs, const type& rhs) { \
+        lhs.x -= rhs.x; \
+        lhs.y -= rhs.y; \
+        return lhs; \
+    }
+
+#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \
+        lhs = lhs * rhs; \
+        return lhs; \
+    }
+
+#define COMPLEX_DIV_PREOP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \
+        lhs = lhs / rhs; \
+        return lhs; \
+    }
+
+#define COMPLEX_SCALAR_PRODUCT(type, type1) \
+    __HOST_DEVICE__ static inline type operator*(const type& lhs, type1 rhs) { \
+        type ret; \
+        ret.x = lhs.x * rhs; \
+        ret.y = lhs.y * rhs; \
+        return ret; \
+    }
+
+#endif
+
+typedef float2 hipFloatComplex;
+
+__HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
+
+__HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
+
+__HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+    hipFloatComplex z;
+    z.x = a;
+    z.y = b;
+    return z;
+}
+
+__HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
+    hipFloatComplex ret;
+    ret.x = z.x;
+    ret.y = -z.y;
+    return ret;
+}
+
+__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
+    return z.x * z.x + z.y * z.y;
+}
+
+__HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+    return make_hipFloatComplex(p.x + q.x, p.y + q.y);
+}
+
+__HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+    return make_hipFloatComplex(p.x - q.x, p.y - q.y);
+}
+
+__HOST_DEVICE__ static inline
hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) { + return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); +} + +__HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) { + float sqabs = hipCsqabsf(q); + hipFloatComplex ret; + ret.x = (p.x * q.x + p.y * q.y) / sqabs; + ret.y = (p.y * q.x - p.x * q.y) / sqabs; + return ret; +} + +__HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); } + + +typedef double2 hipDoubleComplex; + +__HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) { return z.x; } + +__HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) { return z.y; } + +__HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) { + hipDoubleComplex z; + z.x = a; + z.y = b; + return z; +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { + hipDoubleComplex ret; + ret.x = z.x; + ret.y = -z.y; + return ret; +} + +__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) { + return z.x * z.x + z.y * z.y; +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) { + return make_hipDoubleComplex(p.x + q.x, p.y + q.y); +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) { + return make_hipDoubleComplex(p.x - q.x, p.y - q.y); +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) { + return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) { + double sqabs = hipCsqabs(q); + hipDoubleComplex ret; + ret.x = (p.x * q.x + p.y * q.y) / sqabs; + ret.y = (p.y * q.x - p.x * q.y) / sqabs; + return ret; +} + +__HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); } + + +#if __cplusplus + +COMPLEX_NEG_OP_OVERLOAD(hipFloatComplex) +COMPLEX_EQ_OP_OVERLOAD(hipFloatComplex) +COMPLEX_NE_OP_OVERLOAD(hipFloatComplex) +COMPLEX_ADD_OP_OVERLOAD(hipFloatComplex) +COMPLEX_SUB_OP_OVERLOAD(hipFloatComplex) +COMPLEX_MUL_OP_OVERLOAD(hipFloatComplex) +COMPLEX_DIV_OP_OVERLOAD(hipFloatComplex) +COMPLEX_ADD_PREOP_OVERLOAD(hipFloatComplex) +COMPLEX_SUB_PREOP_OVERLOAD(hipFloatComplex) +COMPLEX_MUL_PREOP_OVERLOAD(hipFloatComplex) +COMPLEX_DIV_PREOP_OVERLOAD(hipFloatComplex) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned short) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed short) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned int) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed int) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, float) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, double) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long long) +COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long long) + +COMPLEX_NEG_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_EQ_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_NE_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_ADD_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_SUB_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_MUL_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_DIV_OP_OVERLOAD(hipDoubleComplex) +COMPLEX_ADD_PREOP_OVERLOAD(hipDoubleComplex) +COMPLEX_SUB_PREOP_OVERLOAD(hipDoubleComplex) +COMPLEX_MUL_PREOP_OVERLOAD(hipDoubleComplex) +COMPLEX_DIV_PREOP_OVERLOAD(hipDoubleComplex) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned 
short) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed short) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned int) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed int) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, float) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, double) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long long) +COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long) + +#endif + + +typedef hipFloatComplex hipComplex; + +__HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) { + return make_hipFloatComplex(x, y); +} + +__HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) { + return make_hipFloatComplex((float)z.x, (float)z.y); +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) { + return make_hipDoubleComplex((double)z.x, (double)z.y); +} + +__HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) { + float real = (p.x * q.x) + r.x; + float imag = (q.x * p.y) + r.y; + + real = -(p.y * q.y) + real; + imag = (p.x * q.y) + imag; + + return make_hipComplex(real, imag); +} + +__HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, + hipDoubleComplex r) { + double real = (p.x * q.x) + r.x; + double imag = (q.x * p.y) + r.y; + + real = -(p.y * q.y) + real; + imag = (p.x * q.y) + imag; + + return make_hipDoubleComplex(real, imag); +} + +#endif //HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H diff --git a/include/hip/amd_detail/hip_cooperative_groups.h b/include/hip/amd_detail/hip_cooperative_groups.h new file mode 100644 index 0000000000..cd3e4bf085 --- /dev/null +++ b/include/hip/amd_detail/hip_cooperative_groups.h @@ -0,0 +1,510 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file amd_detail/hip_cooperative_groups.h + * + * @brief Device side implementation of `Cooperative Group` feature. + * + * Defines new types and device API wrappers related to `Cooperative Group` + * feature, which the programmer can directly use in his kernel(s) in order to + * make use of this feature. 
+ */
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+#if __cplusplus
+#include <hip/amd_detail/hip_cooperative_groups_helper.h>
+
+namespace cooperative_groups {
+
+/** \brief The base type of all cooperative group types
+ *
+ *  \details Holds the key properties of a constructed cooperative group types
+ *           object, like the group type, its size, etc
+ */
+class thread_group {
+ protected:
+  uint32_t _type;  // thread_group type
+  uint32_t _size;  // total number of threads in the thread_group
+  uint64_t _mask;  // Lanemask for coalesced and tiled partitioned group types,
+                   // LSB represents lane 0, and MSB represents lane 63
+
+  // Construct a thread group, and set thread group type and other essential
+  // thread group properties. This generic thread group is directly constructed
+  // only when the group is supposed to contain only the calling thread
+  // (through the API - `this_thread()`), and in all other cases, this thread
+  // group object is a sub-object of some other derived thread group object
+  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
+                                uint64_t mask = (uint64_t)0) {
+    _type = type;
+    _size = size;
+    _mask = mask;
+  }
+
+  struct _tiled_info {
+    bool is_tiled;
+    unsigned int size;
+  } tiled_info;
+
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
+                                                       unsigned int tile_size);
+  friend class thread_block;
+
+ public:
+  // Total number of threads in the thread group, and this serves the purpose
+  // for all derived cooperative group types since their `size` is directly
+  // saved during the construction
+  __CG_QUALIFIER__ uint32_t size() const { return _size; }
+  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
+  // Rank of the calling thread within [0, size())
+  __CG_QUALIFIER__ uint32_t thread_rank() const;
+  // Is this cooperative group type valid?
+  __CG_QUALIFIER__ bool is_valid() const;
+  // synchronize the threads in the thread group
+  __CG_QUALIFIER__ void sync() const;
+};
+
+/** \brief The multi-grid cooperative group type
+ *
+ *  \details Represents an inter-device cooperative group type where the
+ *           participating threads within the group span across multiple
+ *           devices, running the (same) kernel on these devices
+ */
+class multi_grid_group : public thread_group {
+  // Only these friend functions are allowed to construct an object of this class
+  // and access its resources
+  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
+
+ protected:
+  // Construct multi-grid thread group (through the API this_multi_grid())
+  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
+      : thread_group(internal::cg_multi_grid, size) {}
+
+ public:
+  // Number of invocations participating in this multi-grid group. In other
+  // words, the number of GPUs
+  __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
+  // Rank of this invocation. In other words, an ID number within the range
+  // [0, num_grids()) of the GPU, this kernel is running on
+  __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
+  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
+  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
+  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
+};
+
+/** \brief User exposed API interface to construct multi-grid cooperative
+ *         group type object - `multi_grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `multi_grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
+  return multi_grid_group(internal::multi_grid::size());
+}
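+// Usage sketch (added for exposition; the kernel body is hypothetical): inside
+// a kernel launched via hipLaunchCooperativeKernelMultiDevice, every GPU can
+// synchronize with its peers through the multi-grid group:
+//
+//   namespace cg = cooperative_groups;
+//   cg::multi_grid_group mgrid = cg::this_multi_grid();
+//   partial_reduce(data, scratch);  // per-device work
+//   mgrid.sync();                   // wait for all participating devices
+//   if (mgrid.grid_rank() == 0) combine(scratch);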
+
+/** \brief The grid cooperative group type
+ *
+ *  \details Represents an inter-workgroup cooperative group type where the
+ *           participating threads within the group span across multiple
+ *           workgroups running the (same) kernel on the same device
+ */
+class grid_group : public thread_group {
+  // Only these friend functions are allowed to construct an object of this class
+  // and access its resources
+  friend __CG_QUALIFIER__ grid_group this_grid();
+
+ protected:
+  // Construct grid thread group (through the API this_grid())
+  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}
+
+ public:
+  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
+  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
+  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
+};
+
+/** \brief User exposed API interface to construct grid cooperative group type
+ *         object - `grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
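+// Usage sketch (added for exposition; the kernel body is hypothetical): a
+// kernel launched with hipLaunchCooperativeKernel may synchronize across all
+// of its workgroups through the grid group:
+//
+//   cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+//   phase_one(buf, grid.thread_rank());
+//   grid.sync();  // all blocks reach this point before phase two starts
+//   phase_two(buf, grid.thread_rank());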
+
+/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
+ *         type
+ *
+ *  \details Represents an intra-workgroup cooperative group type where the
+ *           participating threads within the group are exactly the threads
+ *           which participate in the currently executing `workgroup`
+ */
+class thread_block : public thread_group {
+  // Only these friend functions are allowed to construct an object of this
+  // class and access its resources
+  friend __CG_QUALIFIER__ thread_block this_thread_block();
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
+                                                       unsigned int tile_size);
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
+                                                       unsigned int tile_size);
+
+ protected:
+  // Construct a workgroup thread group (through the API this_thread_block())
+  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
+      : thread_group(internal::cg_workgroup, size) {}
+
+  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
+    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
+    // Invalid tile size, assert
+    if (!tile_size || (tile_size > WAVEFRONT_SIZE) || !pow2) {
+      assert(false && "invalid tile size");
+    }
+
+    thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
+    tiledGroup.tiled_info.size = tile_size;
+    tiledGroup.tiled_info.is_tiled = true;
+    return tiledGroup;
+  }
+
+ public:
+  // 3-dimensional block index within the grid
+  __CG_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
+  // 3-dimensional thread index within the block
+  __CG_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
+  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::workgroup::thread_rank(); }
+  __CG_QUALIFIER__ bool is_valid() const { return internal::workgroup::is_valid(); }
+  __CG_QUALIFIER__ void sync() const { internal::workgroup::sync(); }
+};
+
+/** \brief User exposed API interface to construct workgroup cooperative
+ *         group type object - `thread_block`.
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `thread_block`. Instead, he should construct it through this API
+ *           function.
+ */
+__CG_QUALIFIER__ thread_block this_thread_block() {
+  return thread_block(internal::workgroup::size());
+}
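+// Usage sketch (added for exposition; hypothetical device code): the block
+// group mirrors __syncthreads()-style usage with an object-based API:
+//
+//   cooperative_groups::thread_block block =
+//       cooperative_groups::this_thread_block();
+//   tile[block.thread_index().y][block.thread_index().x] = in[idx];
+//   block.sync();  // equivalent to __syncthreads()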
+
+/** \brief The tiled_group cooperative group type
+ *
+ *  \details Represents one tiled thread group in a wavefront.
+ *           This group type also supports sub-wave level intrinsics.
+ */
+class tiled_group : public thread_group {
+ private:
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
+                                                       unsigned int tile_size);
+  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
+                                                      unsigned int tile_size);
+
+  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
+    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
+
+    if (!tile_size || (tile_size > WAVEFRONT_SIZE) || !pow2) {
+      assert(false && "invalid tile size");
+    }
+
+    if (size() <= tile_size) {
+      return (*this);
+    }
+
+    tiled_group tiledGroup = tiled_group(tile_size);
+    tiledGroup.tiled_info.is_tiled = true;
+    return tiledGroup;
+  }
+
+ protected:
+  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
+      : thread_group(internal::cg_tiled_group, tileSize) {
+    tiled_info.size = tileSize;
+    tiled_info.is_tiled = true;
+  }
+
+ public:
+  __CG_QUALIFIER__ unsigned int size() const { return (tiled_info.size); }
+
+  __CG_QUALIFIER__ unsigned int thread_rank() const {
+    return (internal::workgroup::thread_rank() & (tiled_info.size - 1));
+  }
+
+  __CG_QUALIFIER__ void sync() const {
+    // enforce memory ordering for memory instructions.
+    __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent");
+  }
+};
+
+/**
+ *  Implementation of all publicly exposed base class APIs
+ */
+__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->thread_rank());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->thread_rank());
+    }
+    case internal::cg_workgroup: {
+      return (static_cast<const thread_block*>(this)->thread_rank());
+    }
+    case internal::cg_tiled_group: {
+      return (static_cast<const tiled_group*>(this)->thread_rank());
+    }
+    default: {
+      assert(false && "invalid cooperative group type");
+      return -1;
+    }
+  }
+}
+
+__CG_QUALIFIER__ bool thread_group::is_valid() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->is_valid());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->is_valid());
+    }
+    case internal::cg_workgroup: {
+      return (static_cast<const thread_block*>(this)->is_valid());
+    }
+    case internal::cg_tiled_group: {
+      return (static_cast<const tiled_group*>(this)->is_valid());
+    }
+    default: {
+      assert(false && "invalid cooperative group type");
+      return false;
+    }
+  }
+}
+
+__CG_QUALIFIER__ void thread_group::sync() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      static_cast<const multi_grid_group*>(this)->sync();
+      break;
+    }
+    case internal::cg_grid: {
+      static_cast<const grid_group*>(this)->sync();
+      break;
+    }
+    case internal::cg_workgroup: {
+      static_cast<const thread_block*>(this)->sync();
+      break;
+    }
+    case internal::cg_tiled_group: {
+      static_cast<const tiled_group*>(this)->sync();
+      break;
+    }
+    default: {
+      assert(false && "invalid cooperative group type");
+    }
+  }
+}
+
+/**
+ *  Implementation of publicly exposed `wrapper` APIs on top of basic cooperative
+ *  group type APIs
+ */
+template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }
+
+template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
+  return g.thread_rank();
+}
+
+template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
+
+template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
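+// Usage sketch (added for exposition; hypothetical device code): the wrapper
+// APIs above let the same code operate on any cooperative group type:
+//
+//   template <class CGTy>
+//   __device__ unsigned int my_rank_after_sync(CGTy const& g) {
+//       cooperative_groups::sync(g);              // dispatches to g.sync()
+//       return cooperative_groups::thread_rank(g);
+//   }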
+template <unsigned int tileSize> class tile_base {
+ protected:
+  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
+
+ public:
+  // Rank of the thread within this tile
+  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
+    return (internal::workgroup::thread_rank() & (numThreads - 1));
+  }
+
+  // Number of threads within this tile
+  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
+};
+
+template <unsigned int tileSize>
+class thread_block_tile_base : public tile_base<tileSize> {
+  static_assert(is_valid_tile_size<tileSize>::value,
+                "Tile size is either not a power of 2 or greater than the wavefront size");
+  using tile_base<tileSize>::numThreads;
+
+ public:
+  __CG_STATIC_QUALIFIER__ void sync() {
+    // Enforce ordering for memory instructions
+    __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent");
+  }
+
+  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+    return (__shfl(var, srcRank, numThreads));
+  }
+
+  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+    return (__shfl_down(var, lane_delta, numThreads));
+  }
+
+  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+    return (__shfl_up(var, lane_delta, numThreads));
+  }
+
+  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
+    return (__shfl_xor(var, laneMask, numThreads));
+  }
+};
+
+/** \brief Group type - thread_block_tile
+ *
+ *  \details Represents one tile of a thread group.
+ */
+
+template <unsigned int tileSize>
+class thread_block_tile_type : public thread_block_tile_base<tileSize>, public tiled_group {
+  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
+
+  friend class thread_block_tile_type;
+
+  typedef thread_block_tile_base<tileSize> tbtBase;
+
+ protected:
+  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
+    tiled_info.size = numThreads;
+    tiled_info.is_tiled = true;
+  }
+
+ public:
+  using tbtBase::size;
+  using tbtBase::sync;
+  using tbtBase::thread_rank;
+};
+/** \brief User exposed API to partition groups.
+ *
+ *  \details A collective operation that partitions the parent group into a
+ *  one-dimensional, row-major tiling of subgroups.
+ */
+
+__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
+  if (parent.cg_type() == internal::cg_tiled_group) {
+    const tiled_group* cg = static_cast<const tiled_group*>(&parent);
+    return cg->new_tiled_group(tile_size);
+  } else {
+    const thread_block* tb = static_cast<const thread_block*>(&parent);
+    return tb->new_tiled_group(tile_size);
+  }
+}
+
+// Thread block type overload
+__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
+  return (parent.new_tiled_group(tile_size));
+}
+
+// Tiled group type overload
+__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
+  return (parent.new_tiled_group(tile_size));
+}
+
+template <unsigned int size, class ParentCGTy> class thread_block_tile;
+
+namespace impl {
+template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
+
+template <unsigned int size, class ParentCGTy>
+class thread_block_tile_internal : public thread_block_tile_type<size> {
+ protected:
+  template <unsigned int tbtSize, class tbtParentT>
+  __CG_QUALIFIER__ thread_block_tile_internal(
+      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
+      : thread_block_tile_type<size>() {}
+
+  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
+      : thread_block_tile_type<size>() {}
+};
+}  // namespace impl
+
+template <unsigned int size, class ParentCGTy>
+class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
+ protected:
+  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
+      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
+
+ public:
+  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
+    return thread_block_tile<size, void>(*this);
+  }
+};
+
+
+template <unsigned int size>
+class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
+  template <unsigned int, class ParentCGTy> friend class thread_block_tile;
+
+ protected:
+ public:
+  template <class ParentCGTy>
+  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
+      : impl::thread_block_tile_internal<size, void>(g) {}
+};
+
+template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
+
+namespace impl {
+template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
+
+template <unsigned int size>
+struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
+  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
+      : thread_block_tile<size, thread_block>(g) {}
+};
+
+}  // namespace impl
+
+/** \brief User exposed API to partition groups.
+ *
+ *  \details This constructs a templated class derived from thread_group.
+ *  The template defines the tile size of the new thread group at compile time.
+ */
+template <unsigned int size, class ParentCGTy>
+__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
+  static_assert(is_valid_tile_size<size>::value,
+                "Tiled partition with size > wavefront size. Currently not supported ");
+  return impl::tiled_partition_internal<size, ParentCGTy>(g);
+}
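+// Editorial note, illustrative only (not part of the original header): a
+// minimal sketch of compile-time tiled partitioning; the kernel and variable
+// names are hypothetical.
+//
+//   __global__ void tile_kernel(int* out) {
+//     cooperative_groups::thread_block tb = cooperative_groups::this_thread_block();
+//     auto tile = cooperative_groups::tiled_partition<16>(tb);
+//     int v = tile.thread_rank();
+//     // Pull the value held one lane below us within the 16-wide tile.
+//     v = tile.shfl_down(v, 1);
+//     if (tile.thread_rank() == 0) out[0] = v;
+//   }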
+}  // namespace cooperative_groups
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/include/hip/amd_detail/hip_cooperative_groups_helper.h b/include/hip/amd_detail/hip_cooperative_groups_helper.h
new file mode 100644
index 0000000000..90463485b6
--- /dev/null
+++ b/include/hip/amd_detail/hip_cooperative_groups_helper.h
@@ -0,0 +1,180 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file amd_detail/hip_cooperative_groups_helper.h
+ *
+ * @brief Device side implementation of the cooperative groups feature.
+ *
+ * Defines helper constructs and APIs which aid the types and device API
+ * wrappers defined within `amd_detail/hip_cooperative_groups.h`.
+ */
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+
+#if __cplusplus
+#include
+#include
+
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#if !defined(__CG_QUALIFIER__)
+#define __CG_QUALIFIER__ __device__ __forceinline__
+#endif
+
+#if !defined(__CG_STATIC_QUALIFIER__)
+#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
+#endif
+
+#if !defined(_CG_STATIC_CONST_DECL_)
+#define _CG_STATIC_CONST_DECL_ static constexpr
+#endif
+
+#if !defined(WAVEFRONT_SIZE)
+#if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__
+#define WAVEFRONT_SIZE 32
+#else
+#define WAVEFRONT_SIZE 64
+#endif
+#endif
+
+namespace cooperative_groups {
+
+/* Global scope */
+template <unsigned int size>
+using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;
+
+template <unsigned int size>
+using is_valid_wavefront = std::integral_constant<bool, (size <= WAVEFRONT_SIZE)>;
+
+template <unsigned int size>
+using is_valid_tile_size =
+    std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
+
+template <typename T>
+using is_valid_type =
+    std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;
+
+namespace internal {
+
+/** \brief Enums representing different cooperative group types
+ */
+typedef enum {
+  cg_invalid,
+  cg_multi_grid,
+  cg_grid,
+  cg_workgroup,
+  cg_tiled_group
+} group_type;
+
+/**
+ * Functionalities related to the multi-grid cooperative group type
+ */
+namespace multi_grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t num_grids() { return (uint32_t)__ockl_multi_grid_num_grids(); }
+
+__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { return (uint32_t)__ockl_multi_grid_grid_rank(); }
+
+__CG_STATIC_QUALIFIER__ uint32_t size() { return (uint32_t)__ockl_multi_grid_size(); }
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return (uint32_t)__ockl_multi_grid_thread_rank(); }
+
+__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_multi_grid_is_valid(); }
+
+__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
+
+}  // namespace multi_grid
+
+/**
+ * Functionalities related to the grid cooperative group type
+ */
+namespace grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return (uint32_t)((hipBlockDim_z * hipGridDim_z) * (hipBlockDim_y * hipGridDim_y) *
+                    (hipBlockDim_x * hipGridDim_x));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  // Compute the global id of the workgroup to which the current thread belongs
+  uint32_t blkIdx = (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
+                               (hipBlockIdx_y * hipGridDim_x) + (hipBlockIdx_x));
+
+  // Compute the total number of threads in the workgroups that precede the
+  // current workgroup within the grid
+  uint32_t num_threads_till_current_workgroup =
+      (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
+
+  // Compute the local rank of the thread within the current workgroup
+  uint32_t local_thread_rank = (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+                                          (hipThreadIdx_y * hipBlockDim_x) + (hipThreadIdx_x));
+
+  return (num_threads_till_current_workgroup + local_thread_rank);
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_grid_is_valid(); }
+
+__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
+
+}  // namespace grid
+
+/**
+ * Functionalities related to the `workgroup` (thread_block in CUDA terminology)
+ * cooperative group type
+ */
+namespace workgroup {
+
+__CG_STATIC_QUALIFIER__ dim3 group_index() {
+  return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y, (uint32_t)hipBlockIdx_z));
+}
+
+__CG_STATIC_QUALIFIER__ dim3 thread_index() {
+  return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y, (uint32_t)hipThreadIdx_z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return ((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+                     (hipThreadIdx_y * hipBlockDim_x) + (hipThreadIdx_x)));
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+  // TODO(mahesha): does any additional functionality need to be added here? I believe not.
+  return true;
+}
+
+__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
+
+}  // namespace workgroup
+
+}  // namespace internal
+
+}  // namespace cooperative_groups
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
diff --git a/include/hip/amd_detail/hip_fp16.h b/include/hip/amd_detail/hip_fp16.h
new file mode 100644
index 0000000000..fb344aa7d5
--- /dev/null
+++ b/include/hip/amd_detail/hip_fp16.h
@@ -0,0 +1,1662 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
+
+#include
+#include "hip/amd_detail/host_defines.h"
+#if defined(__HIPCC_RTC__)
+  #define __HOST_DEVICE__ __device__
+#else
+  #define __HOST_DEVICE__ __host__ __device__
+  #include
+  #if defined(__cplusplus)
+    #include
+    #include
+    #include
+#endif
+#endif // !defined(__HIPCC_RTC__)
+
+#if __HIP_CLANG_ONLY__
+  typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
+
+  struct __half_raw {
+    union {
+      static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
+
+      _Float16 data;
+      unsigned short x;
+    };
+  };
+
+  struct __half2_raw {
+    union {
+      static_assert(sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
+
+      _Float16_2 data;
+      struct {
+        unsigned short x;
+        unsigned short y;
+      };
+    };
+  };
+
+  #if defined(__cplusplus)
+    #include "hip_fp16_math_fwd.h"
+    #include "hip_vector_types.h"
+    #include "host_defines.h"
+
+    namespace std
+    {
+      template<> struct is_floating_point<_Float16> : std::true_type {};
+    }
+
+    template<bool cond, typename T = void>
+    using Enable_if_t = typename std::enable_if<cond, T>::type;
+
+    // BEGIN STRUCT __HALF
+    struct __half {
+     protected:
+      union {
+        static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
+
+        _Float16 data;
+        unsigned short __x;
+      };
+     public:
+      // CREATORS
+      __HOST_DEVICE__
+      __half() = default;
+      __HOST_DEVICE__
+      __half(const __half_raw& x) : data{x.data} {}
+      #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+        __HOST_DEVICE__
+        __half(decltype(data) x) : data{x} {}
+        template<
+          typename T,
+          Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
+        __HOST_DEVICE__
+        __half(T x) : data{static_cast<_Float16>(x)} {}
+      #endif
+      __HOST_DEVICE__
+      __half(const __half&) = default;
+      __HOST_DEVICE__
+      __half(__half&&) = default;
+      __HOST_DEVICE__
+      ~__half() = default;
+
+      // CREATORS - DEVICE ONLY
+      #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+        template<
+          typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
+        __HOST_DEVICE__
+        __half(T x) : data{static_cast<_Float16>(x)} {}
+      #endif
+
+      // MANIPULATORS
+      __HOST_DEVICE__
+      __half& operator=(const __half&) = default;
+      __HOST_DEVICE__
+      __half& operator=(__half&&) = default;
+      __HOST_DEVICE__
+      __half& operator=(const __half_raw& x)
+      {
+        data = x.data;
+        return *this;
+      }
+      __HOST_DEVICE__
+      volatile __half& operator=(const __half_raw& x) volatile
+      {
+        data = x.data;
+        return *this;
+      }
+      volatile __half& operator=(const volatile __half_raw& x) volatile
+      {
+        data = x.data;
+        return *this;
+      }
+      __half& operator=(__half_raw&& x)
+      {
+        data = x.data;
+        return *this;
+      }
+      volatile __half& operator=(__half_raw&& x) volatile
+      {
+        data = x.data;
+        return *this;
+      }
+      volatile __half& operator=(volatile __half_raw&& x) volatile
+      {
+        data = x.data;
+        return *this;
+      }
+      #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+        template<
+          typename T,
+          Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
+        __HOST_DEVICE__
+        __half& operator=(T x)
+        {
+          data = static_cast<_Float16>(x);
+          return *this;
+        }
+      #endif
+
+      // MANIPULATORS - DEVICE ONLY
+      #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+        template<
+          typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
+        __device__
+        __half& operator=(T x)
+        {
+          data = static_cast<_Float16>(x);
+          return *this;
+        }
+      #endif
+
+      #if !defined(__HIP_NO_HALF_OPERATORS__)
+        __device__
+        __half& operator+=(const __half& x)
+        {
+          data += x.data;
+          return *this;
+        }
+        __device__
+        __half& operator-=(const __half& x)
+        {
+          data -= x.data;
+          return *this;
+        }
+        __device__
+        __half& operator*=(const __half& x)
+        {
+          data *= x.data;
+          return *this;
+        }
+        __device__
+        __half& operator/=(const __half& x)
+        {
+          data /= x.data;
+          return *this;
+        }
+        __device__
+        __half& operator++() { ++data; return *this; }
+        __device__
+        __half operator++(int)
+        {
+          __half tmp{*this};
+          ++*this;
+          return tmp;
+        }
+        __device__
+        __half& operator--() { --data; return *this; }
+        __device__
+        __half operator--(int)
+        {
+          __half tmp{*this};
+          --*this;
+          return tmp;
+        }
+      #endif
+
+      // ACCESSORS
+      #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+        template<
+          typename T,
+          Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
+        __HOST_DEVICE__
+        operator T() const { return data; }
+      #endif
+      __HOST_DEVICE__
+      operator __half_raw() const { return __half_raw{data}; }
+      __HOST_DEVICE__
+      operator __half_raw() const volatile
+      {
+        return __half_raw{data};
+      }
+
+      #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+        template<
+          typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
+        __HOST_DEVICE__
+        operator T() const { return data; }
+      #endif
+
+      #if !defined(__HIP_NO_HALF_OPERATORS__)
+        __device__
+        __half operator+() const { return *this; }
+        __device__
+        __half operator-() const
+        {
+          __half tmp{*this};
+          tmp.data = -tmp.data;
+          return tmp;
+        }
+      #endif
+
+      // FRIENDS
+      #if !defined(__HIP_NO_HALF_OPERATORS__)
+        friend
+        inline
+        __device__
+        __half operator+(const __half& x, const __half& y)
+        {
+          return __half{x} += y;
+        }
+        friend
+        inline
+        __device__
+        __half operator-(const __half& x, const __half& y)
+        {
+          return __half{x} -= y;
+        }
+        friend
+        inline
+        __device__
+        __half operator*(const __half& x, const __half& y)
+        {
+          return __half{x} *= y;
+        }
+        friend
+        inline
+        __device__
+        __half operator/(const __half& x, const __half& y)
+        {
+          return __half{x} /= y;
+        }
+        friend
+        inline
+        __device__
+        bool operator==(const __half& x, const __half& y)
+        {
+          return x.data == y.data;
+        }
+        friend
+        inline
+        __device__
+        bool operator!=(const __half& x, const __half& y)
+        {
+          return !(x == y);
+        }
+        friend
+        inline
+        __device__
+        bool operator<(const __half& x, const __half& y)
+        {
+          return x.data < y.data;
+        }
+        friend
+        inline
+        __device__
+        bool operator>(const __half& x, const __half& y)
+        {
+          return y.data < x.data;
+        }
+        friend
+        inline
+        __device__
+        bool operator<=(const __half& x, const __half& y)
+        {
+          return !(y < x);
+        }
+        friend
+        inline
+        __device__
+        bool operator>=(const __half& x, const __half& y)
+        {
+          return !(x < y);
+        }
+      #endif // !defined(__HIP_NO_HALF_OPERATORS__)
+    };
+    // END STRUCT __HALF
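+    // Editorial note, illustrative only (not part of the original header):
+    // with the conversion and operator sections above enabled (that is, with
+    // __HIP_NO_HALF_CONVERSIONS__ and __HIP_NO_HALF_OPERATORS__ left
+    // undefined), device code can use __half arithmetically, e.g.
+    //   __half a = __float2half(1.5f);  // conversion helper defined below
+    //   __half b = a + a;               // friend operator+ above
+    //   float  f = __half2float(b);     // f == 3.0f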
+    // BEGIN STRUCT __HALF2
+    struct __half2 {
+     public:
+      union {
+        static_assert(
+          sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
+
+        _Float16_2 data;
+        struct {
+          unsigned short x;
+          unsigned short y;
+        };
+      };
+
+      // CREATORS
+      __HOST_DEVICE__
+      __half2() = default;
+      __HOST_DEVICE__
+      __half2(const __half2_raw& x) : data{x.data} {}
+      __HOST_DEVICE__
+      __half2(decltype(data) x) : data{x} {}
+      __HOST_DEVICE__
+      __half2(const __half& x, const __half& y)
+        :
+        data{
+          static_cast<__half_raw>(x).data,
+          static_cast<__half_raw>(y).data}
+      {}
+      __HOST_DEVICE__
+      __half2(const __half2&) = default;
+      __HOST_DEVICE__
+      __half2(__half2&&) = default;
+      __HOST_DEVICE__
+      ~__half2() = default;
+
+      // MANIPULATORS
+      __HOST_DEVICE__
+      __half2& operator=(const __half2&) = default;
+      __HOST_DEVICE__
+      __half2& operator=(__half2&&) = default;
+      __HOST_DEVICE__
+      __half2& operator=(const __half2_raw& x)
+      {
+        data = x.data;
+        return *this;
+      }
+
+      // MANIPULATORS - DEVICE ONLY
+      #if !defined(__HIP_NO_HALF_OPERATORS__)
+        __device__
+        __half2& operator+=(const __half2& x)
+        {
+          data += x.data;
+          return *this;
+        }
+        __device__
+        __half2&
operator-=(const __half2& x) + { + data -= x.data; + return *this; + } + __device__ + __half2& operator*=(const __half2& x) + { + data *= x.data; + return *this; + } + __device__ + __half2& operator/=(const __half2& x) + { + data /= x.data; + return *this; + } + __device__ + __half2& operator++() { return *this += _Float16_2{1, 1}; } + __device__ + __half2 operator++(int) + { + __half2 tmp{*this}; + ++*this; + return tmp; + } + __device__ + __half2& operator--() { return *this -= _Float16_2{1, 1}; } + __device__ + __half2 operator--(int) + { + __half2 tmp{*this}; + --*this; + return tmp; + } + #endif + + // ACCESSORS + __HOST_DEVICE__ + operator decltype(data)() const { return data; } + __HOST_DEVICE__ + operator __half2_raw() const { return __half2_raw{data}; } + + // ACCESSORS - DEVICE ONLY + #if !defined(__HIP_NO_HALF_OPERATORS__) + __device__ + __half2 operator+() const { return *this; } + __device__ + __half2 operator-() const + { + __half2 tmp{*this}; + tmp.data = -tmp.data; + return tmp; + } + #endif + + // FRIENDS + #if !defined(__HIP_NO_HALF_OPERATORS__) + friend + inline + __device__ + __half2 operator+(const __half2& x, const __half2& y) + { + return __half2{x} += y; + } + friend + inline + __device__ + __half2 operator-(const __half2& x, const __half2& y) + { + return __half2{x} -= y; + } + friend + inline + __device__ + __half2 operator*(const __half2& x, const __half2& y) + { + return __half2{x} *= y; + } + friend + inline + __device__ + __half2 operator/(const __half2& x, const __half2& y) + { + return __half2{x} /= y; + } + friend + inline + __device__ + bool operator==(const __half2& x, const __half2& y) + { + auto r = x.data == y.data; + return r.x != 0 && r.y != 0; + } + friend + inline + __device__ + bool operator!=(const __half2& x, const __half2& y) + { + return !(x == y); + } + friend + inline + __device__ + bool operator<(const __half2& x, const __half2& y) + { + auto r = x.data < y.data; + return r.x != 0 && r.y != 0; + } + friend + inline + __device__ + bool operator>(const __half2& x, const __half2& y) + { + return y < x; + } + friend + inline + __device__ + bool operator<=(const __half2& x, const __half2& y) + { + return !(y < x); + } + friend + inline + __device__ + bool operator>=(const __half2& x, const __half2& y) + { + return !(x < y); + } + #endif // !defined(__HIP_NO_HALF_OPERATORS__) + }; + // END STRUCT __HALF2 + + namespace + { + inline + __HOST_DEVICE__ + __half2 make_half2(__half x, __half y) + { + return __half2{x, y}; + } + + inline + __HOST_DEVICE__ + __half __low2half(__half2 x) + { + return __half{__half_raw{static_cast<__half2_raw>(x).data.x}}; + } + + inline + __HOST_DEVICE__ + __half __high2half(__half2 x) + { + return __half{__half_raw{static_cast<__half2_raw>(x).data.y}}; + } + + inline + __HOST_DEVICE__ + __half2 __half2half2(__half x) + { + return __half2{x, x}; + } + + inline + __HOST_DEVICE__ + __half2 __halves2half2(__half x, __half y) + { + return __half2{x, y}; + } + + inline + __HOST_DEVICE__ + __half2 __low2half2(__half2 x) + { + return __half2{ + _Float16_2{ + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(x).data.x}}; + } + + inline + __HOST_DEVICE__ + __half2 __high2half2(__half2 x) + { + return __half2_raw{ + _Float16_2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(x).data.y}}; + } + + inline + __HOST_DEVICE__ + __half2 __lows2half2(__half2 x, __half2 y) + { + return __half2_raw{ + _Float16_2{ + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(y).data.x}}; + } + + inline + 
__HOST_DEVICE__ + __half2 __highs2half2(__half2 x, __half2 y) + { + return __half2_raw{ + _Float16_2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(y).data.y}}; + } + + inline + __HOST_DEVICE__ + __half2 __lowhigh2highlow(__half2 x) + { + return __half2_raw{ + _Float16_2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(x).data.x}}; + } + + // Bitcasts + inline + __device__ + short __half_as_short(__half x) + { + return static_cast<__half_raw>(x).x; + } + + inline + __device__ + unsigned short __half_as_ushort(__half x) + { + return static_cast<__half_raw>(x).x; + } + + inline + __device__ + __half __short_as_half(short x) + { + __half_raw r; r.x = x; + return r; + } + + inline + __device__ + __half __ushort_as_half(unsigned short x) + { + __half_raw r; r.x = x; + return r; + } + + // TODO: rounding behaviour is not correct. + // float -> half | half2 + inline + __HOST_DEVICE__ + __half __float2half(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __HOST_DEVICE__ + __half __float2half_rn(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __HOST_DEVICE__ + __half __float2half_rz(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __HOST_DEVICE__ + __half __float2half_rd(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __HOST_DEVICE__ + __half __float2half_ru(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __HOST_DEVICE__ + __half2 __float2half2_rn(float x) + { + return __half2_raw{ + _Float16_2{ + static_cast<_Float16>(x), static_cast<_Float16>(x)}}; + } + inline + __HOST_DEVICE__ + __half2 __floats2half2_rn(float x, float y) + { + return __half2_raw{_Float16_2{ + static_cast<_Float16>(x), static_cast<_Float16>(y)}}; + } + inline + __HOST_DEVICE__ + __half2 __float22half2_rn(float2 x) + { + return __floats2half2_rn(x.x, x.y); + } + + // half | half2 -> float + inline + __HOST_DEVICE__ + float __half2float(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __HOST_DEVICE__ + float __low2float(__half2 x) + { + return static_cast<__half2_raw>(x).data.x; + } + inline + __HOST_DEVICE__ + float __high2float(__half2 x) + { + return static_cast<__half2_raw>(x).data.y; + } + inline + __HOST_DEVICE__ + float2 __half22float2(__half2 x) + { + return make_float2( + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(x).data.y); + } + + // half -> int + inline + __device__ + int __half2int_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // int -> half + inline + __device__ + __half __int2half_rn(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __int2half_rz(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __int2half_rd(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __int2half_ru(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // half -> short + inline + __device__ + short __half2short_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_rz(__half x) + { + return static_cast<__half_raw>(x).data; + 
} + inline + __device__ + short __half2short_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // short -> half + inline + __device__ + __half __short2half_rn(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __short2half_rz(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __short2half_rd(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __short2half_ru(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // half -> long long + inline + __device__ + long long __half2ll_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // long long -> half + inline + __device__ + __half __ll2half_rn(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ll2half_rz(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ll2half_rd(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ll2half_ru(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // half -> unsigned int + inline + __device__ + unsigned int __half2uint_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // unsigned int -> half + inline + __device__ + __half __uint2half_rn(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __uint2half_rz(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __uint2half_rd(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __uint2half_ru(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // half -> unsigned short + inline + __device__ + unsigned short __half2ushort_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // unsigned short -> half + inline + __device__ + __half __ushort2half_rn(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ushort2half_rz(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ushort2half_rd(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ushort2half_ru(unsigned short x) + { + return 
__half_raw{static_cast<_Float16>(x)}; + } + + // half -> unsigned long long + inline + __device__ + unsigned long long __half2ull_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // unsigned long long -> half + inline + __device__ + __half __ull2half_rn(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ull2half_rz(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ull2half_rd(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ull2half_ru(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + + // Load primitives + inline + __device__ + __half __ldg(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldcg(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldca(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldcs(const __half* ptr) { return *ptr; } + + inline + __HOST_DEVICE__ + __half2 __ldg(const __half2* ptr) { return *ptr; } + inline + __HOST_DEVICE__ + __half2 __ldcg(const __half2* ptr) { return *ptr; } + inline + __HOST_DEVICE__ + __half2 __ldca(const __half2* ptr) { return *ptr; } + inline + __HOST_DEVICE__ + __half2 __ldcs(const __half2* ptr) { return *ptr; } + + // Relations + inline + __device__ + bool __heq(__half x, __half y) + { + return static_cast<__half_raw>(x).data == + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hne(__half x, __half y) + { + return static_cast<__half_raw>(x).data != + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hle(__half x, __half y) + { + return static_cast<__half_raw>(x).data <= + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hge(__half x, __half y) + { + return static_cast<__half_raw>(x).data >= + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hlt(__half x, __half y) + { + return static_cast<__half_raw>(x).data < + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hgt(__half x, __half y) + { + return static_cast<__half_raw>(x).data > + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hequ(__half x, __half y) { return __heq(x, y); } + inline + __device__ + bool __hneu(__half x, __half y) { return __hne(x, y); } + inline + __device__ + bool __hleu(__half x, __half y) { return __hle(x, y); } + inline + __device__ + bool __hgeu(__half x, __half y) { return __hge(x, y); } + inline + __device__ + bool __hltu(__half x, __half y) { return __hlt(x, y); } + inline + __device__ + bool __hgtu(__half x, __half y) { return __hgt(x, y); } + + inline + __HOST_DEVICE__ + __half2 __heq2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data == + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hne2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data != + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hle2(__half2 x, __half2 y) + { + auto r = 
static_cast<__half2_raw>(x).data <= + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hge2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data >= + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hlt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data < + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hgt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data > + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __HOST_DEVICE__ + __half2 __hequ2(__half2 x, __half2 y) { return __heq2(x, y); } + inline + __HOST_DEVICE__ + __half2 __hneu2(__half2 x, __half2 y) { return __hne2(x, y); } + inline + __HOST_DEVICE__ + __half2 __hleu2(__half2 x, __half2 y) { return __hle2(x, y); } + inline + __HOST_DEVICE__ + __half2 __hgeu2(__half2 x, __half2 y) { return __hge2(x, y); } + inline + __HOST_DEVICE__ + __half2 __hltu2(__half2 x, __half2 y) { return __hlt2(x, y); } + inline + __HOST_DEVICE__ + __half2 __hgtu2(__half2 x, __half2 y) { return __hgt2(x, y); } + + inline + __HOST_DEVICE__ + bool __hbeq2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__heq2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hbne2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hne2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hble2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hle2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hbge2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hge2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hblt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hlt2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hbgt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hgt2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __HOST_DEVICE__ + bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); } + inline + __HOST_DEVICE__ + bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); } + inline + __HOST_DEVICE__ + bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); } + inline + __HOST_DEVICE__ + bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); } + inline + __HOST_DEVICE__ + bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); } + inline + __HOST_DEVICE__ + bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); } + + // Arithmetic + inline + __device__ + __half __clamp_01(__half x) + { + auto r = static_cast<__half_raw>(x); + + if (__hlt(x, __half_raw{0})) return __half_raw{0}; + if (__hlt(__half_raw{1}, x)) return __half_raw{1}; + return r; + } + + inline + __device__ + __half __hadd(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data + + static_cast<__half_raw>(y).data}; + } + inline + __device__ + __half __habs(__half x) + { + return __half_raw{ + __ocml_fabs_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half __hsub(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data - + static_cast<__half_raw>(y).data}; + } + inline + __device__ + __half 
__hmul(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data * + static_cast<__half_raw>(y).data}; + } + inline + __device__ + __half __hadd_sat(__half x, __half y) + { + return __clamp_01(__hadd(x, y)); + } + inline + __device__ + __half __hsub_sat(__half x, __half y) + { + return __clamp_01(__hsub(x, y)); + } + inline + __device__ + __half __hmul_sat(__half x, __half y) + { + return __clamp_01(__hmul(x, y)); + } + inline + __device__ + __half __hfma(__half x, __half y, __half z) + { + return __half_raw{__ocml_fma_f16( + static_cast<__half_raw>(x).data, + static_cast<__half_raw>(y).data, + static_cast<__half_raw>(z).data)}; + } + inline + __device__ + __half __hfma_sat(__half x, __half y, __half z) + { + return __clamp_01(__hfma(x, y, z)); + } + inline + __device__ + __half __hdiv(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data / + static_cast<__half_raw>(y).data}; + } + + inline + __HOST_DEVICE__ + __half2 __hadd2(__half2 x, __half2 y) + { + return __half2_raw{ + static_cast<__half2_raw>(x).data + + static_cast<__half2_raw>(y).data}; + } + inline + __HOST_DEVICE__ + __half2 __habs2(__half2 x) + { + return __half2_raw{ + __ocml_fabs_2f16(static_cast<__half2_raw>(x).data)}; + } + inline + __HOST_DEVICE__ + __half2 __hsub2(__half2 x, __half2 y) + { + return __half2_raw{ + static_cast<__half2_raw>(x).data - + static_cast<__half2_raw>(y).data}; + } + inline + __HOST_DEVICE__ + __half2 __hmul2(__half2 x, __half2 y) + { + return __half2_raw{ + static_cast<__half2_raw>(x).data * + static_cast<__half2_raw>(y).data}; + } + inline + __HOST_DEVICE__ + __half2 __hadd2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hadd2(x, y)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __HOST_DEVICE__ + __half2 __hsub2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hsub2(x, y)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __HOST_DEVICE__ + __half2 __hmul2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hmul2(x, y)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __HOST_DEVICE__ + __half2 __hfma2(__half2 x, __half2 y, __half2 z) + { + return __half2_raw{__ocml_fma_2f16(x, y, z)}; + } + inline + __HOST_DEVICE__ + __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z) + { + auto r = static_cast<__half2_raw>(__hfma2(x, y, z)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __HOST_DEVICE__ + __half2 __h2div(__half2 x, __half2 y) + { + return __half2_raw{ + static_cast<__half2_raw>(x).data / + static_cast<__half2_raw>(y).data}; + } + + // Math functions + #if __HIP_CLANG_ONLY__ + inline + __device__ + float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) { + return __ockl_fdot2(static_cast<__half2_raw>(a).data, + static_cast<__half2_raw>(b).data, + c, saturate); + } + #endif + inline + __device__ + __half htrunc(__half x) + { + return __half_raw{ + __ocml_trunc_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hceil(__half x) + { + return __half_raw{ + __ocml_ceil_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hfloor(__half x) + { + return __half_raw{ + __ocml_floor_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hrint(__half x) + { + return __half_raw{ + 
__ocml_rint_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hsin(__half x) + { + return __half_raw{ + __ocml_sin_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hcos(__half x) + { + return __half_raw{ + __ocml_cos_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hexp(__half x) + { + return __half_raw{ + __ocml_exp_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hexp2(__half x) + { + return __half_raw{ + __ocml_exp2_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hexp10(__half x) + { + return __half_raw{ + __ocml_exp10_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hlog2(__half x) + { + return __half_raw{ + __ocml_log2_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hlog(__half x) + { + return __half_raw{ + __ocml_log_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hlog10(__half x) + { + return __half_raw{ + __ocml_log10_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hrcp(__half x) + { + return __half_raw{ + __llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hrsqrt(__half x) + { + return __half_raw{ + __ocml_rsqrt_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hsqrt(__half x) + { + return __half_raw{ + __ocml_sqrt_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + bool __hisinf(__half x) + { + return __ocml_isinf_f16(static_cast<__half_raw>(x).data); + } + inline + __device__ + bool __hisnan(__half x) + { + return __ocml_isnan_f16(static_cast<__half_raw>(x).data); + } + inline + __device__ + __half __hneg(__half x) + { + return __half_raw{-static_cast<__half_raw>(x).data}; + } + + inline + __HOST_DEVICE__ + __half2 h2trunc(__half2 x) + { + return __half2_raw{__ocml_trunc_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2ceil(__half2 x) + { + return __half2_raw{__ocml_ceil_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2floor(__half2 x) + { + return __half2_raw{__ocml_floor_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2rint(__half2 x) + { + return __half2_raw{__ocml_rint_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2sin(__half2 x) + { + return __half2_raw{__ocml_sin_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2cos(__half2 x) + { + return __half2_raw{__ocml_cos_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2exp(__half2 x) + { + return __half2_raw{__ocml_exp_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2exp2(__half2 x) + { + return __half2_raw{__ocml_exp2_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2exp10(__half2 x) + { + return __half2_raw{__ocml_exp10_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2log2(__half2 x) + { + return __half2_raw{__ocml_log2_2f16(x)}; + } + inline + __HOST_DEVICE__ + __half2 h2log(__half2 x) { return __ocml_log_2f16(x); } + inline + __HOST_DEVICE__ + __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); } + inline + __HOST_DEVICE__ + __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); } + inline + __HOST_DEVICE__ + __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); } + inline + __HOST_DEVICE__ + __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); } + inline + __HOST_DEVICE__ + __half2 __hisinf2(__half2 x) + { + auto r = __ocml_isinf_2f16(x); + return __half2_raw{_Float16_2{ + static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}}; + } + inline + __HOST_DEVICE__ + __half2 
__hisnan2(__half2 x)
+    {
+      auto r = __ocml_isnan_2f16(x);
+      return __half2_raw{_Float16_2{
+        static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
+    }
+    inline
+    __HOST_DEVICE__
+    __half2 __hneg2(__half2 x)
+    {
+      return __half2_raw{-static_cast<__half2_raw>(x).data};
+    }
+  } // Anonymous namespace.
+
+  #if !defined(HIP_NO_HALF)
+    using half = __half;
+    using half2 = __half2;
+  #endif
+  #endif // defined(__cplusplus)
+#elif defined(__GNUC__)
+  #include "hip_fp16_gcc.h"
+#endif // !defined(__clang__) && defined(__GNUC__)
+
+#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
diff --git a/include/hip/amd_detail/hip_fp16_gcc.h b/include/hip/amd_detail/hip_fp16_gcc.h
new file mode 100644
index 0000000000..e76a7fff3a
--- /dev/null
+++ b/include/hip/amd_detail/hip_fp16_gcc.h
@@ -0,0 +1,254 @@
+#pragma once
+
+#if defined(__cplusplus)
+  #include <cstring>
+#endif
+
+struct __half_raw {
+  unsigned short x;
+};
+
+struct __half2_raw {
+  unsigned short x;
+  unsigned short y;
+};
+
+#if defined(__cplusplus)
+  struct __half;
+
+  __half __float2half(float);
+  float __half2float(__half);
+
+  // BEGIN STRUCT __HALF
+  struct __half {
+   protected:
+    unsigned short __x;
+   public:
+    // CREATORS
+    __half() = default;
+    __half(const __half_raw& x) : __x{x.x} {}
+    #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+      __half(float x) : __x{__float2half(x).__x} {}
+      __half(double x) : __x{__float2half(x).__x} {}
+    #endif
+    __half(const __half&) = default;
+    __half(__half&&) = default;
+    ~__half() = default;
+
+    // MANIPULATORS
+    __half& operator=(const __half&) = default;
+    __half& operator=(__half&&) = default;
+    __half& operator=(const __half_raw& x) { __x = x.x; return *this; }
+    #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+      __half& operator=(float x)
+      {
+        __x = __float2half(x).__x;
+        return *this;
+      }
+      __half& operator=(double x)
+      {
+        return *this = static_cast<float>(x);
+      }
+    #endif
+
+    // ACCESSORS
+    operator float() const { return __half2float(*this); }
+    operator __half_raw() const { return __half_raw{__x}; }
+  };
+  // END STRUCT __HALF
+
+  // BEGIN STRUCT __HALF2
+  struct __half2 {
+   public:
+    __half x;
+    __half y;
+
+    // CREATORS
+    __half2() = default;
+    __half2(const __half2_raw& ix)
+      :
+      x{reinterpret_cast<const __half&>(ix.x)},
+      y{reinterpret_cast<const __half&>(ix.y)}
+    {}
+    __half2(const __half& ix, const __half& iy) : x{ix}, y{iy} {}
+    __half2(const __half2&) = default;
+    __half2(__half2&&) = default;
+    ~__half2() = default;
+
+    // MANIPULATORS
+    __half2& operator=(const __half2&) = default;
+    __half2& operator=(__half2&&) = default;
+    __half2& operator=(const __half2_raw& ix)
+    {
+      x = reinterpret_cast<const __half&>(ix.x);
+      y = reinterpret_cast<const __half&>(ix.y);
+      return *this;
+    }
+
+    // ACCESSORS
+    operator __half2_raw() const
+    {
+      return __half2_raw{
+        reinterpret_cast<const unsigned short&>(x),
+        reinterpret_cast<const unsigned short&>(y)};
+    }
+  };
+  // END STRUCT __HALF2
+
+  inline
+  unsigned short __internal_float2half(
+    float flt, unsigned int& sgn, unsigned int& rem)
+  {
+    unsigned int x{};
+    std::memcpy(&x, &flt, sizeof(flt));
+
+    unsigned int u = (x & 0x7fffffffU);
+    sgn = ((x >> 16) & 0x8000U);
+
+    // NaN/+Inf/-Inf
+    if (u >= 0x7f800000U) {
+      rem = 0;
+      return static_cast<unsigned short>(
+        (u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
+    }
+    // Overflows
+    if (u > 0x477fefffU) {
+      rem = 0x80000000U;
+      return static_cast<unsigned short>(sgn | 0x7bffU);
+    }
+    // Normal numbers
+    if (u >= 0x38800000U) {
+      rem = u << 19;
+      u -= 0x38000000U;
+      return static_cast<unsigned short>(sgn | (u >> 13));
+    }
+    // +0/-0
+    if (u < 0x33000001U) {
+      rem = u;
+      return static_cast<unsigned short>(sgn);
+    }
+    // Denormal numbers
+    unsigned int exponent = u >> 23;
+    unsigned int mantissa = (u & 0x7fffffU);
+    unsigned int shift = 0x7eU - exponent;
+    mantissa |= 0x800000U;
+    rem = mantissa << (32 - shift);
+    return static_cast<unsigned short>(sgn | (mantissa >> shift));
+  }
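+  // Editorial note, illustrative only: __internal_float2half returns the
+  // truncated half-precision bit pattern and reports the discarded mantissa
+  // bits, left-aligned, through `rem`. The __float2half* variants below use
+  // `rem` to apply their rounding mode; round-to-nearest-even, for example,
+  // increments the result when rem > 0x80000000U (more than halfway) or when
+  // rem == 0x80000000U and the low result bit is set (a tie, rounded to even).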
+
+  inline
+  __half __float2half(float x)
+  {
+    __half_raw r;
+    unsigned int sgn{};
+    unsigned int rem{};
+    r.x = __internal_float2half(x, sgn, rem);
+    if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x;
+
+    return r;
+  }
+
+  inline
+  __half __float2half_rn(float x) { return __float2half(x); }
+
+  inline
+  __half __float2half_rz(float x)
+  {
+    __half_raw r;
+    unsigned int sgn{};
+    unsigned int rem{};
+    r.x = __internal_float2half(x, sgn, rem);
+
+    return r;
+  }
+
+  inline
+  __half __float2half_rd(float x)
+  {
+    __half_raw r;
+    unsigned int sgn{};
+    unsigned int rem{};
+    r.x = __internal_float2half(x, sgn, rem);
+    if (rem && sgn) ++r.x;
+
+    return r;
+  }
+
+  inline
+  __half __float2half_ru(float x)
+  {
+    __half_raw r;
+    unsigned int sgn{};
+    unsigned int rem{};
+    r.x = __internal_float2half(x, sgn, rem);
+    if (rem && !sgn) ++r.x;
+
+    return r;
+  }
+
+  inline
+  __half2 __float2half2_rn(float x)
+  {
+    return __half2{__float2half_rn(x), __float2half_rn(x)};
+  }
+
+  inline
+  __half2 __floats2half2_rn(float x, float y)
+  {
+    return __half2{__float2half_rn(x), __float2half_rn(y)};
+  }
+
+  inline
+  float __internal_half2float(unsigned short x)
+  {
+    unsigned int sign = ((x >> 15) & 1);
+    unsigned int exponent = ((x >> 10) & 0x1f);
+    unsigned int mantissa = ((x & 0x3ff) << 13);
+
+    if (exponent == 0x1fU) { /* NaN or Inf */
+      mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0);
+      exponent = 0xffU;
+    } else if (!exponent) { /* Denorm or Zero */
+      if (mantissa) {
+        unsigned int msb;
+        exponent = 0x71U;
+        do {
+          msb = (mantissa & 0x400000U);
+          mantissa <<= 1; /* normalize */
+          --exponent;
+        } while (!msb);
+        mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+      }
+    } else {
+      exponent += 0x70U;
+    }
+    unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
+    float f;
+    memcpy(&f, &u, sizeof(u));
+
+    return f;
+  }
+
+  inline
+  float __half2float(__half x)
+  {
+    return __internal_half2float(static_cast<__half_raw>(x).x);
+  }
+
+  inline
+  float __low2float(__half2 x)
+  {
+    return __internal_half2float(static_cast<__half2_raw>(x).x);
+  }
+
+  inline
+  float __high2float(__half2 x)
+  {
+    return __internal_half2float(static_cast<__half2_raw>(x).y);
+  }
+
+  #if !defined(HIP_NO_HALF)
+    using half = __half;
+    using half2 = __half2;
+  #endif
+#endif // defined(__cplusplus)
diff --git a/include/hip/amd_detail/hip_fp16_math_fwd.h b/include/hip/amd_detail/hip_fp16_math_fwd.h
new file mode 100644
index 0000000000..7d2cf22bc3
--- /dev/null
+++ b/include/hip/amd_detail/hip_fp16_math_fwd.h
@@ -0,0 +1,86 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +// /* +// Half Math Functions +// */ + +#include "host_defines.h" +#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ +extern "C" +{ + __device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16); + __device__ _Float16 __ocml_cos_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16); + __device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16); + __device__ __attribute__((const)) + _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16); + __device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16); + __device__ __attribute__((const)) int __ocml_isinf_f16(_Float16); + __device__ __attribute__((const)) int __ocml_isnan_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16); + __device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int); + __device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16); + __device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16); + __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16); + __device__ _Float16 __ocml_sin_f16(_Float16); + __device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16); + __device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16); + + typedef _Float16 __2f16 __attribute__((ext_vector_type(2))); + typedef short __2i16 __attribute__((ext_vector_type(2))); + + #if __HIP_CLANG_ONLY__ + __device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s); + #endif + + __device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16); + __device__ __2f16 __ocml_cos_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16); + __device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16); + __device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16); + __device__ 
__attribute__((pure)) __2f16 __ocml_log_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16); + __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); + __device__ inline + __2f16 __llvm_amdgcn_rcp_2f16(__2f16 x) // Not currently exposed by ROCDL. + { + return __2f16{__llvm_amdgcn_rcp_f16(x.x), __llvm_amdgcn_rcp_f16(x.y)}; + } + __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16); + __device__ __2f16 __ocml_sin_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); +} +#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ diff --git a/include/hip/amd_detail/hip_ldg.h b/include/hip/amd_detail/hip_ldg.h new file mode 100644 index 0000000000..4b8b1227a1 --- /dev/null +++ b/include/hip/amd_detail/hip_ldg.h @@ -0,0 +1,100 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_LDG_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_LDG_H + +#if __HIP_CLANG_ONLY__ +#include "hip_vector_types.h" +#include "host_defines.h" + +__device__ inline static char __ldg(const char* ptr) { return *ptr; } + +__device__ inline static char2 __ldg(const char2* ptr) { return *ptr; } + +__device__ inline static char4 __ldg(const char4* ptr) { return *ptr; } + +__device__ inline static signed char __ldg(const signed char* ptr) { return ptr[0]; } + +__device__ inline static unsigned char __ldg(const unsigned char* ptr) { return ptr[0]; } + + +__device__ inline static short __ldg(const short* ptr) { return ptr[0]; } + +__device__ inline static short2 __ldg(const short2* ptr) { return ptr[0]; } + +__device__ inline static short4 __ldg(const short4* ptr) { return ptr[0]; } + +__device__ inline static unsigned short __ldg(const unsigned short* ptr) { return ptr[0]; } + + +__device__ inline static int __ldg(const int* ptr) { return ptr[0]; } + +__device__ inline static int2 __ldg(const int2* ptr) { return ptr[0]; } + +__device__ inline static int4 __ldg(const int4* ptr) { return ptr[0]; } + +__device__ inline static unsigned int __ldg(const unsigned int* ptr) { return ptr[0]; } + + +__device__ inline static long __ldg(const long* ptr) { return ptr[0]; } + +__device__ inline static unsigned long __ldg(const unsigned long* ptr) { return ptr[0]; } + + +__device__ inline static long long __ldg(const long long* ptr) { return ptr[0]; } + +__device__ inline static longlong2 __ldg(const longlong2* ptr) { return ptr[0]; } + +__device__ inline static unsigned long long __ldg(const unsigned long long* ptr) { return ptr[0]; } + + +__device__ inline static uchar2 __ldg(const uchar2* ptr) { return ptr[0]; } + +__device__ inline static uchar4 __ldg(const uchar4* ptr) { return ptr[0]; } + + +__device__ inline static ushort2 __ldg(const ushort2* ptr) { return ptr[0]; } + + +__device__ inline static uint2 __ldg(const uint2* ptr) { return ptr[0]; } + +__device__ inline static uint4 __ldg(const uint4* ptr) { return ptr[0]; } + + +__device__ inline static ulonglong2 __ldg(const ulonglong2* ptr) { return ptr[0]; } + + +__device__ inline static float __ldg(const float* ptr) { return ptr[0]; } + +__device__ inline static float2 __ldg(const float2* ptr) { return ptr[0]; } + +__device__ inline static float4 __ldg(const float4* ptr) { return ptr[0]; } + + +__device__ inline static double __ldg(const double* ptr) { return ptr[0]; } + +__device__ inline static double2 __ldg(const double2* ptr) { return ptr[0]; } + +#endif // __HIP_CLANG_ONLY__ + +#endif // HIP_LDG_H diff --git a/include/hip/amd_detail/hip_memory.h b/include/hip/amd_detail/hip_memory.h new file mode 100644 index 0000000000..f2c01633ea --- /dev/null +++ b/include/hip/amd_detail/hip_memory.h @@ -0,0 +1,114 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_MEMORY_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_MEMORY_H
+
+// Implementation of the malloc and free device functions.
+// The HIP heap is implemented as a global array of fixed size. Users may define
+// __HIP_SIZE_OF_PAGE and __HIP_NUM_PAGES to obtain a larger heap.
+
+#if __HIP__ && __HIP_ENABLE_DEVICE_MALLOC__
+
+// Size of a page, in bytes.
+#ifndef __HIP_SIZE_OF_PAGE
+#define __HIP_SIZE_OF_PAGE 64
+#endif
+
+// Total number of pages.
+#ifndef __HIP_NUM_PAGES
+#define __HIP_NUM_PAGES (16 * 64 * 64)
+#endif
+
+#define __HIP_SIZE_OF_HEAP (__HIP_NUM_PAGES * __HIP_SIZE_OF_PAGE)
+
+#if __HIP_DEVICE_COMPILE__
+__attribute__((weak)) __device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP];
+__attribute__((weak)) __device__
+    uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];
+#else
+extern __device__ char __hip_device_heap[];
+extern __device__ uint32_t __hip_device_page_flag[];
+#endif
+
+// Note: this allocator updates __hip_device_page_flag without atomics, so
+// concurrent allocations from different work-items may race. This is one
+// reason device malloc is disabled by default (see hip_runtime.h).
+extern "C" inline __device__ void* __hip_malloc(size_t size) {
+    char* heap = (char*)__hip_device_heap;
+    if (size > __HIP_SIZE_OF_HEAP) {
+        return (void*)nullptr;
+    }
+    uint32_t totalThreads =
+        hipBlockDim_x * hipGridDim_x * hipBlockDim_y
+        * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
+    uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x
+        + (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x
+        + (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x
+        * hipBlockDim_y;
+
+    uint32_t numHeapsPerWorkItem = __HIP_NUM_PAGES / totalThreads;
+    uint32_t heapSizePerWorkItem = __HIP_SIZE_OF_HEAP / totalThreads;
+
+    // Round the request up to whole pages so that stride is always at least 1;
+    // a request smaller than one page still claims a full page.
+    uint32_t stride = (size + __HIP_SIZE_OF_PAGE - 1) / __HIP_SIZE_OF_PAGE;
+    uint32_t start = numHeapsPerWorkItem * currentWorkItem;
+
+    uint32_t k = 0;
+
+    // Scan this work-item's own pages for a free slot. The flags are written
+    // at start + k below, so the search begins at start as well.
+    while (__hip_device_page_flag[start + k] > 0) {
+        k++;
+    }
+
+    // Mark the intermediate pages of the block with 1 and the final page with 2.
+    for (uint32_t i = 0; i < stride - 1; i++) {
+        __hip_device_page_flag[start + k + i] = 1;
+    }
+
+    __hip_device_page_flag[start + k + stride - 1] = 2;
+
+    // heapSizePerWorkItem * currentWorkItem is the byte offset of page 'start'.
+    void* ptr = (void*)(heap
+        + heapSizePerWorkItem * currentWorkItem + k * __HIP_SIZE_OF_PAGE);
+
+    return ptr;
+}
+
+extern "C" inline __device__ void* __hip_free(void* ptr) {
+    if (ptr == nullptr) {
+        return nullptr;
+    }
+
+    uint32_t offsetByte = (uint64_t)ptr - (uint64_t)__hip_device_heap;
+    uint32_t offsetPage = offsetByte / __HIP_SIZE_OF_PAGE;
+
+    // Clear page flags from the first page of the block up to and including
+    // the terminating page (flag value 2).
+    while (__hip_device_page_flag[offsetPage] != 0) {
+        if (__hip_device_page_flag[offsetPage] == 2) {
+            __hip_device_page_flag[offsetPage] = 0;
+            offsetPage++;
+            break;
+        } else {
+            __hip_device_page_flag[offsetPage] = 0;
+            offsetPage++;
+        }
+    }
+
+    return nullptr;
+}
+
+#endif // __HIP__ && __HIP_ENABLE_DEVICE_MALLOC__
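+
+// Illustrative sketch (hypothetical example, not part of the original header):
+// how a kernel would draw scratch space from the device heap above when the
+// translation unit is built with __HIP_ENABLE_DEVICE_MALLOC__ defined to 1.
+// The kernel and buffer names are made up for illustration only.
+#if 0
+__global__ void scratch_example(int* out) {
+    // One 16-int block is carved out of this work-item's page range.
+    int* tmp = (int*)__hip_malloc(16 * sizeof(int));
+    if (tmp != nullptr) {
+        for (int i = 0; i < 16; ++i) tmp[i] = i;
+        out[hipThreadIdx_x] = tmp[15];
+        __hip_free(tmp);  // clears the page flags claimed above
+    }
+}
+#endif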
+
+#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_MEMORY_H
diff --git a/include/hip/amd_detail/hip_runtime.h b/include/hip/amd_detail/hip_runtime.h
new file mode 100644
index 0000000000..a3db57ffe3
--- /dev/null
+++ b/include/hip/amd_detail/hip_runtime.h
@@ -0,0 +1,417 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file amd_detail/hip_runtime.h
+ * @brief Contains definitions of APIs for HIP runtime.
+ */
+
+//#pragma once
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
+
+#include <hip/amd_detail/hip_common.h>
+
+//---
+// Top part of file can be compiled with any compiler
+
+#if !defined(__HIPCC_RTC__)
+//#include
+#if __cplusplus
+#include <cmath>
+#include <cstdint>
+#else
+#include <math.h>
+#include <string.h>
+#include <stddef.h>
+#endif // __cplusplus
+#endif // !defined(__HIPCC_RTC__)
+
+// __hip_malloc is not working. Disable it by default.
+#ifndef __HIP_ENABLE_DEVICE_MALLOC__
+#define __HIP_ENABLE_DEVICE_MALLOC__ 0
+#endif
+
+#if __HIP_CLANG_ONLY__
+
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#define CUDA_SUCCESS hipSuccess
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_runtime_api.h>
+extern int HIP_TRACE_API;
+#endif // !defined(__HIPCC_RTC__)
+
+#ifdef __cplusplus
+#include <hip/amd_detail/hip_ldg.h>
+#endif
+#include <hip/amd_detail/hip_atomic.h>
+#include <hip/amd_detail/host_defines.h>
+#include <hip/amd_detail/device_functions.h>
+#include <hip/amd_detail/surface_functions.h>
+#include <hip/amd_detail/texture_fetch_functions.h>
+#include <hip/amd_detail/texture_indirect_functions.h>
+
+// TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
+#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
+#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
+#endif
+
+// Feature tests:
+#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
+// Device compile and not host compile:
+
+// 32-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
+
+// 64-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)
+
+// Doubles
+#define __HIP_ARCH_HAS_DOUBLES__ (1)
+
+// warp cross-lane operations:
+#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
+
+// sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
+
+// misc
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
+#define __HIP_ARCH_HAS_3DGRID__ (1)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
+
+#endif /* Device feature flags */
+
+
+#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
+#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),     \
+                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
+#define select_impl_(_1, _2, impl_, ...) impl_
+#define __launch_bounds__(...) \
+    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
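+
+// Illustrative sketch (hypothetical example, not part of the original header):
+// __launch_bounds__ dispatches to one of the two implementations above based
+// on argument count. A made-up kernel bounded to 256 threads per block and a
+// minimum of 2 waves per EU would be declared as:
+#if 0
+__global__ void __launch_bounds__(256, 2) bounded_kernel(float* data) {
+    // expands to amdgpu_flat_work_group_size(1, 256), amdgpu_waves_per_eu(2)
+    data[hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x] *= 2.0f;
+}
+#endif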
+
+#if !defined(__HIPCC_RTC__)
+__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
+#endif // !defined(__HIPCC_RTC__)
+
+#if __HIP_ARCH_GFX701__ == 0
+
+__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
+__device__ float __hip_ds_bpermutef(int index, float src);
+__device__ unsigned __hip_ds_permute(int index, unsigned src);
+__device__ float __hip_ds_permutef(int index, float src);
+
+template <int pattern>
+__device__ unsigned __hip_ds_swizzle_N(unsigned int src);
+template <int pattern>
+__device__ float __hip_ds_swizzlef_N(float src);
+
+template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
+__device__ int __hip_move_dpp_N(int src);
+
+#endif // __HIP_ARCH_GFX701__ == 0
+
+#ifndef __OPENMP_AMDGCN__
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#if __HIP_ENABLE_DEVICE_MALLOC__
+extern "C" __device__ void* __hip_malloc(size_t);
+extern "C" __device__ void* __hip_free(void* ptr);
+static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
+static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
+#else
+static inline __device__ void* malloc(size_t size) { __builtin_trap(); return nullptr; }
+static inline __device__ void* free(void* ptr) { __builtin_trap(); return nullptr; }
+#endif
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#endif // !__OPENMP_AMDGCN__
+
+// End doxygen API:
+/**
+ *   @}
+ */
+
+//
+// hip-clang functions
+//
+#if !defined(__HIPCC_RTC__)
+#define HIP_KERNEL_NAME(...) __VA_ARGS__
+#define HIP_SYMBOL(X) X
+
+typedef int hipLaunchParm;
+
+template <std::size_t n, typename... Ts,
+          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
+void pArgs(const std::tuple<Ts...>&, void*) {}
+
+template <std::size_t n, typename... Ts,
+          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
+void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
+    using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;
+
+    static_assert(!std::is_reference<T>{},
+                  "A __global__ function cannot have a reference as one of its "
+                  "arguments.");
+#if defined(HIP_STRICT)
+    static_assert(std::is_trivially_copyable<T>{},
+                  "Only TriviallyCopyable types can be arguments to a __global__ "
+                  "function");
+#endif
+    _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
+    return pArgs<n + 1>(formals, _vargs);
+}
+
+template <typename... Formals, typename... Actuals>
+std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...),
+                                             std::tuple<Actuals...> actuals) {
+    static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
+    std::tuple<Formals...> to_formals{std::move(actuals)};
+    return to_formals;
+}
+
+#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
+template <typename... Args, typename F = void (*)(Args...)>
+void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+                        std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
+    constexpr size_t count = sizeof...(Args);
+    auto tup_ = std::tuple<Args...>{args...};
+    auto tup = validateArgsCountType(kernel, tup_);
+    void* _Args[count];
+    pArgs<0>(tup, _Args);
+
+    auto k = reinterpret_cast<void*>(kernel);
+    hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
+}
+#else
+#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
+    do {                                                                                          \
+        kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);        \
+    } while (0)
+
+#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
+#endif
+
+#include
+#endif // !defined(__HIPCC_RTC__)
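+
+// Illustrative sketch (hypothetical example, not part of the original header):
+// host-side launch through hipLaunchKernelGGL, which resolves to either the
+// template above (with HIP_TEMPLATE_KERNEL_LAUNCH) or the triple-chevron
+// macro. The kernel and helper names below are made up for illustration.
+#if 0
+__global__ void scale(float* data, float factor) {
+    data[hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x] *= factor;
+}
+
+void launch_scale(float* d_data, int n) {
+    // n assumed to be a multiple of 256; 0 bytes of dynamic LDS, null stream.
+    hipLaunchKernelGGL(scale, dim3(n / 256), dim3(256), 0, 0, d_data, 2.0f);
+}
+#endif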
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
+struct __HIP_BlockIdx {
+    __device__
+    std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); }
+};
+struct __HIP_BlockDim {
+    __device__
+    std::uint32_t operator()(std::uint32_t x) const noexcept {
+        return __ockl_get_local_size(x);
+    }
+};
+struct __HIP_GridDim {
+    __device__
+    std::uint32_t operator()(std::uint32_t x) const noexcept {
+        return __ockl_get_num_groups(x);
+    }
+};
+struct __HIP_ThreadIdx {
+    __device__
+    std::uint32_t operator()(std::uint32_t x) const noexcept {
+        return __ockl_get_local_id(x);
+    }
+};
+
+#if defined(__HIPCC_RTC__)
+typedef struct dim3 {
+    uint32_t x;  ///< x
+    uint32_t y;  ///< y
+    uint32_t z;  ///< z
+#ifdef __cplusplus
+    constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
+#endif
+} dim3;
+#endif // defined(__HIPCC_RTC__)
+
+template <typename F>
+struct __HIP_Coordinates {
+    using R = decltype(F{}(0));
+
+    struct __X { __device__ operator R() const noexcept { return F{}(0); } };
+    struct __Y { __device__ operator R() const noexcept { return F{}(1); } };
+    struct __Z { __device__ operator R() const noexcept { return F{}(2); } };
+
+    static constexpr __X x{};
+    static constexpr __Y y{};
+    static constexpr __Z z{};
+#ifdef __cplusplus
+    __device__ operator dim3() const { return dim3(x, y, z); }
+#endif
+};
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
+                        __HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
+    return __ockl_get_global_size(0);
+}
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
+                        __HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
+    return __ockl_get_global_size(0);
+}
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
+                        __HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
+    return __ockl_get_global_size(1);
+}
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
+                        __HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
+    return __ockl_get_global_size(1);
+}
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
+                        __HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
+    return __ockl_get_global_size(2);
+}
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
+                        __HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
+    return __ockl_get_global_size(2);
+}
+
+static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
+static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
+static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
+static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
+#define hipThreadIdx_x (__ockl_get_local_id(0))
+#define hipThreadIdx_y (__ockl_get_local_id(1))
+#define hipThreadIdx_z (__ockl_get_local_id(2))
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
+#define hipBlockIdx_x (__ockl_get_group_id(0))
+#define hipBlockIdx_y (__ockl_get_group_id(1))
+#define hipBlockIdx_z (__ockl_get_group_id(2))
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
+#define hipBlockDim_x (__ockl_get_local_size(0))
+#define hipBlockDim_y (__ockl_get_local_size(1))
+#define hipBlockDim_z (__ockl_get_local_size(2))
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
+#define hipGridDim_x (__ockl_get_num_groups(0))
+#define hipGridDim_y (__ockl_get_num_groups(1))
+#define hipGridDim_z (__ockl_get_num_groups(2))
+
+#include
+
+#if __HIP_HCC_COMPAT_MODE__
+// Define HCC work item functions in terms of HIP builtin variables.
+#pragma push_macro("__DEFINE_HCC_FUNC")
+#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
+inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
+    if (i==0) \
+        return hip_var.x; \
+    else if(i==1) \
+        return hip_var.y; \
+    else \
+        return hip_var.z; \
+}
+
+__DEFINE_HCC_FUNC(workitem_id, threadIdx)
+__DEFINE_HCC_FUNC(group_id, blockIdx)
+__DEFINE_HCC_FUNC(group_size, blockDim)
+__DEFINE_HCC_FUNC(num_groups, gridDim)
+#pragma pop_macro("__DEFINE_HCC_FUNC")
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
+inline __device__ __attribute__((always_inline)) uint
+hc_get_workitem_absolute_id(int dim)
+{
+    return (uint)__ockl_get_global_id(dim);
+}
+
+#endif // __HIP_HCC_COMPAT_MODE__
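+
+// Illustrative sketch (hypothetical example, not part of the original header):
+// the coordinate objects and the hipThreadIdx_*/hipBlockIdx_* macros above are
+// interchangeable ways to compute a global work-item index.
+#if 0
+__device__ unsigned int global_linear_id_x() {
+    unsigned int gid = threadIdx.x + blockIdx.x * blockDim.x;          // objects
+    unsigned int gid2 = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; // macros
+    // gridDim.x * blockDim.x folds into a single __ockl_get_global_size(0)
+    // call through the operator* overloads defined above.
+    unsigned int total = gridDim.x * blockDim.x;
+    return (gid == gid2 && gid < total) ? gid : 0u;
+}
+#endif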
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#if !defined(__HIPCC_RTC__)
+// Support std::complex.
+#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#pragma push_macro("__CUDA__")
+#define __CUDA__
+#include <__clang_cuda_math_forward_declares.h>
+#include <__clang_cuda_complex_builtins.h>
+// Workaround for using libc++ with HIP-Clang.
+// The following headers require the clang include path to come before the
+// standard C++ include path. However, the libc++ include path must come before
+// the clang include path. To work around this, we pass -isystem with the
+// parent directory of the clang include path instead of the clang include
+// path itself.
+#include <algorithm>
+#include <complex>
+#include <new>
+#undef __CUDA__
+#pragma pop_macro("__CUDA__")
+#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#endif // !defined(__HIPCC_RTC__)
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#endif // __HIP_CLANG_ONLY__
+
+#include <hip/amd_detail/hip_memory.h>
+
+#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
diff --git a/include/hip/amd_detail/hip_runtime_api.h b/include/hip/amd_detail/hip_runtime_api.h
new file mode 100644
index 0000000000..b64f6e9ec1
--- /dev/null
+++ b/include/hip/amd_detail/hip_runtime_api.h
@@ -0,0 +1,4354 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+//#pragma once
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_API_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_API_H
+/**
+ * @file amd_detail/hip_runtime_api.h
+ * @brief Contains C function APIs for HIP runtime. This file does not use any HCC builtin or
+ * special language extensions (-hc mode); those functions are in hip_runtime.h.
+ */
+#include
+#include
+
+#ifndef GENERIC_GRID_LAUNCH
+#define GENERIC_GRID_LAUNCH 1
+#endif
+
+#include
+#include
+#include
+#include
+
+#if defined(_MSC_VER)
+#define DEPRECATED(msg) __declspec(deprecated(msg))
+#else // !defined(_MSC_VER)
+#define DEPRECATED(msg) __attribute__ ((deprecated(msg)))
+#endif // !defined(_MSC_VER)
+
+#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases.
For more details please refer to https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_deprecated_api_list.md"
+
+#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
+#define HIP_LAUNCH_PARAM_END ((void*)0x03)
+
+#ifdef __cplusplus
+  #define __dparm(x) \
+          = x
+#else
+  #define __dparm(x)
+#endif
+
+#ifdef __GNUC__
+#pragma GCC visibility push (default)
+#endif
+
+#ifdef __cplusplus
+namespace hip_impl {
+hipError_t hip_init();
+}  // namespace hip_impl
+#endif
+
+// Structure definitions:
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//---
+// API-visible structures
+typedef struct ihipCtx_t* hipCtx_t;
+
+// Note many APIs also use integer deviceIds as an alternative to the device pointer:
+typedef int hipDevice_t;
+
+typedef enum hipDeviceP2PAttr {
+    hipDevP2PAttrPerformanceRank = 0,
+    hipDevP2PAttrAccessSupported,
+    hipDevP2PAttrNativeAtomicSupported,
+    hipDevP2PAttrHipArrayAccessSupported
+} hipDeviceP2PAttr;
+
+typedef struct ihipStream_t* hipStream_t;
+
+#define hipIpcMemLazyEnablePeerAccess 0
+
+#define HIP_IPC_HANDLE_SIZE 64
+
+typedef struct hipIpcMemHandle_st {
+    char reserved[HIP_IPC_HANDLE_SIZE];
+} hipIpcMemHandle_t;
+
+typedef struct hipIpcEventHandle_st {
+    char reserved[HIP_IPC_HANDLE_SIZE];
+} hipIpcEventHandle_t;
+
+typedef struct ihipModule_t* hipModule_t;
+
+typedef struct ihipModuleSymbol_t* hipFunction_t;
+
+typedef struct hipFuncAttributes {
+    int binaryVersion;
+    int cacheModeCA;
+    size_t constSizeBytes;
+    size_t localSizeBytes;
+    int maxDynamicSharedSizeBytes;
+    int maxThreadsPerBlock;
+    int numRegs;
+    int preferredShmemCarveout;
+    int ptxVersion;
+    size_t sharedSizeBytes;
+} hipFuncAttributes;
+
+typedef struct ihipEvent_t* hipEvent_t;
+
+enum hipLimit_t {
+    hipLimitMallocHeapSize = 0x02,
+};
+
+/**
+ * @addtogroup GlobalDefs More
+ * @{
+ */
+//! Flags that can be used with hipStreamCreateWithFlags
+#define hipStreamDefault \
+    0x00 ///< Default stream creation flags. These are used with hipStreamCreate().
+#define hipStreamNonBlocking 0x01 ///< Stream does not implicitly synchronize with null stream
+
+
+//! Flags that can be used with hipEventCreateWithFlags:
+#define hipEventDefault 0x0 ///< Default flags
+#define hipEventBlockingSync \
+    0x1 ///< Waiting will yield CPU. Power-friendly and usage-friendly but may increase latency.
+#define hipEventDisableTiming \
+    0x2 ///< Disable event's capability to record timing information. May improve performance.
+#define hipEventInterprocess 0x4 ///< Event can support IPC. @warning - not supported in HIP.
+#define hipEventReleaseToDevice \
+    0x40000000 ///< Use a device-scope release when recording this event. This flag is useful to
+               ///< obtain more precise timings of commands between events. The flag is a no-op on
+               ///< CUDA platforms.
+#define hipEventReleaseToSystem \
+    0x80000000 ///< Use a system-scope release when recording this event. This flag is
+               ///< useful to make non-coherent host memory visible to the host. The flag is a
+               ///< no-op on CUDA platforms.
+
+
+//! Flags that can be used with hipHostMalloc
+#define hipHostMallocDefault 0x0
+#define hipHostMallocPortable 0x1 ///< Memory is considered allocated by all contexts.
+#define hipHostMallocMapped \
+    0x2 ///< Map the allocation into the address space for the current device. The device pointer
+        ///< can be obtained with #hipHostGetDevicePointer.
+#define hipHostMallocWriteCombined 0x4
+#define hipHostMallocNumaUser \
+    0x20000000 ///< Host memory allocation will follow the NUMA policy set by the user
+
+#define hipHostMallocCoherent \
+    0x40000000 ///< Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific
+               ///< allocation.
+#define hipHostMallocNonCoherent \
+    0x80000000 ///< Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific
+               ///< allocation.
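+
+// Illustrative sketch (hypothetical example, not part of the original header):
+// requesting pinned, device-mapped host memory with the flags above, then
+// retrieving the matching device pointer via hipHostGetDevicePointer. The
+// function name is made up; error handling is elided.
+#if 0
+void pinned_alloc_example(void** hostPtr, void** devPtr, size_t bytes) {
+    hipHostMalloc(hostPtr, bytes, hipHostMallocMapped | hipHostMallocPortable);
+    hipHostGetDevicePointer(devPtr, *hostPtr, 0 /* flags must be 0 */);
+}
+#endif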
+
+#define hipMemAttachGlobal 0x01 ///< Memory can be accessed by any stream on any device
+#define hipMemAttachHost 0x02   ///< Memory cannot be accessed by any stream on any device
+#define hipMemAttachSingle 0x04 ///< Memory can only be accessed by a single stream on
+                                ///< the associated device
+
+#define hipDeviceMallocDefault 0x0
+#define hipDeviceMallocFinegrained 0x1 ///< Memory is allocated in fine grained region of device.
+#define hipMallocSignalMemory 0x2 ///< Memory represents an HSA signal.
+
+//! Flags that can be used with hipHostRegister
+#define hipHostRegisterDefault 0x0  ///< Memory is Mapped and Portable
+#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts.
+#define hipHostRegisterMapped \
+    0x2 ///< Map the allocation into the address space for the current device. The device pointer
+        ///< can be obtained with #hipHostGetDevicePointer.
+#define hipHostRegisterIoMemory 0x4 ///< Not supported.
+#define hipExtHostRegisterCoarseGrained 0x8 ///< Coarse Grained host memory lock
+
+#define hipDeviceScheduleAuto 0x0 ///< Automatically select between Spin and Yield
+#define hipDeviceScheduleSpin \
+    0x1 ///< Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and
+        ///< may consume more power.
+#define hipDeviceScheduleYield \
+    0x2 ///< Yield the CPU to the operating system when waiting. May increase latency, but lowers
+        ///< power and is friendlier to other threads in the system.
+#define hipDeviceScheduleBlockingSync 0x4
+#define hipDeviceScheduleMask 0x7
+
+#define hipDeviceMapHost 0x8
+#define hipDeviceLmemResizeToMax 0x16
+
+#define hipArrayDefault 0x00 ///< Default HIP array allocation flag
+#define hipArrayLayered 0x01
+#define hipArraySurfaceLoadStore 0x02
+#define hipArrayCubemap 0x04
+#define hipArrayTextureGather 0x08
+
+#define hipOccupancyDefault 0x00
+
+#define hipCooperativeLaunchMultiDeviceNoPreSync 0x01
+#define hipCooperativeLaunchMultiDeviceNoPostSync 0x02
+
+#define hipCpuDeviceId ((int)-1)
+#define hipInvalidDeviceId ((int)-2)
+
+// Flags that can be used with hipExtLaunch Set of APIs
+#define hipExtAnyOrderLaunch 0x01 ///< AnyOrderLaunch of kernels
+
+// Flags to be used with hipStreamWaitValue32 and hipStreamWaitValue64
+#define hipStreamWaitValueGte 0x0
+#define hipStreamWaitValueEq 0x1
+#define hipStreamWaitValueAnd 0x2
+#define hipStreamWaitValueNor 0x3
+
+/*
+ * @brief HIP Memory Advise values
+ * @enum
+ * @ingroup Enumerations
+ */
+typedef enum hipMemoryAdvise {
+    hipMemAdviseSetReadMostly = 1,          ///< Data will mostly be read and only occasionally
+                                            ///< be written to
+    hipMemAdviseUnsetReadMostly = 2,        ///< Undo the effect of hipMemAdviseSetReadMostly
+    hipMemAdviseSetPreferredLocation = 3,   ///< Set the preferred location for the data as
+                                            ///< the specified device
+    hipMemAdviseUnsetPreferredLocation = 4, ///< Clear the preferred location for the data
+    hipMemAdviseSetAccessedBy = 5,          ///< Data will be accessed by the specified device,
+                                            ///< so prevent page faults as much as possible
+    hipMemAdviseUnsetAccessedBy = 6         ///< Let the Unified Memory subsystem decide on
+                                            ///< the page faulting policy for the specified device
+} hipMemoryAdvise;
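+
+// Illustrative sketch (hypothetical example, not part of the original header):
+// applying read-mostly advice to a managed allocation. hipMallocManaged and
+// hipMemAdvise are the HIP APIs that consume these enum values; the function
+// and buffer names here are made up.
+#if 0
+void advise_read_mostly(int deviceId, size_t bytes) {
+    void* managed = nullptr;
+    hipMallocManaged(&managed, bytes);
+    // Hint that the range will mostly be read; occasional writes are allowed.
+    hipMemAdvise(managed, bytes, hipMemAdviseSetReadMostly, deviceId);
+}
+#endif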
+
+/*
+ * @brief HIP range attributes
+ * @enum
+ * @ingroup Enumerations
+ */
+typedef enum hipMemRangeAttribute {
+    hipMemRangeAttributeReadMostly = 1,        ///< Whether the range will mostly be read and
+                                               ///< only occasionally be written to
+    hipMemRangeAttributePreferredLocation = 2, ///< The preferred location of the range
+    hipMemRangeAttributeAccessedBy = 3,        ///< Memory range has hipMemAdviseSetAccessedBy
+                                               ///< set for the specified device
+    hipMemRangeAttributeLastPrefetchLocation = 4,///< The last location to which the range was prefetched
+} hipMemRangeAttribute;
+
+/*
+ * @brief hipJitOption
+ * @enum
+ * @ingroup Enumerations
+ */
+typedef enum hipJitOption {
+    hipJitOptionMaxRegisters = 0,
+    hipJitOptionThreadsPerBlock,
+    hipJitOptionWallTime,
+    hipJitOptionInfoLogBuffer,
+    hipJitOptionInfoLogBufferSizeBytes,
+    hipJitOptionErrorLogBuffer,
+    hipJitOptionErrorLogBufferSizeBytes,
+    hipJitOptionOptimizationLevel,
+    hipJitOptionTargetFromContext,
+    hipJitOptionTarget,
+    hipJitOptionFallbackStrategy,
+    hipJitOptionGenerateDebugInfo,
+    hipJitOptionLogVerbose,
+    hipJitOptionGenerateLineInfo,
+    hipJitOptionCacheMode,
+    hipJitOptionSm3xOpt,
+    hipJitOptionFastCompile,
+    hipJitOptionNumOptions
+} hipJitOption;
+
+/**
+ * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
+ */
+typedef enum hipFuncAttribute {
+    hipFuncAttributeMaxDynamicSharedMemorySize = 8,
+    hipFuncAttributePreferredSharedMemoryCarveout = 9,
+    hipFuncAttributeMax
+} hipFuncAttribute;
+
+/**
+ * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
+ */
+typedef enum hipFuncCache_t {
+    hipFuncCachePreferNone,   ///< no preference for shared memory or L1 (default)
+    hipFuncCachePreferShared, ///< prefer larger shared memory and smaller L1 cache
+    hipFuncCachePreferL1,     ///< prefer larger L1 cache and smaller shared memory
+    hipFuncCachePreferEqual,  ///< prefer equal size L1 cache and shared memory
+} hipFuncCache_t;
+
+/**
+ * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
+ */
+typedef enum hipSharedMemConfig {
+    hipSharedMemBankSizeDefault,  ///< The compiler selects a device-specific value for the banking.
+    hipSharedMemBankSizeFourByte, ///< Shared mem is banked at 4-byte intervals and performs best
+                                  ///< when adjacent threads access data 4 bytes apart.
+    hipSharedMemBankSizeEightByte ///< Shared mem is banked at 8-byte intervals and performs best
+                                  ///< when adjacent threads access data 8 bytes apart.
+} hipSharedMemConfig;
+
+/**
+ * Struct for data in 3D
+ *
+ */
+typedef struct dim3 {
+    uint32_t x; ///< x
+    uint32_t y; ///< y
+    uint32_t z; ///< z
+#ifdef __cplusplus
+    constexpr __host__ __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
+#endif
+} dim3;
+
+typedef struct hipLaunchParams_t {
+    void* func;       ///< Device function symbol
+    dim3 gridDim;     ///< Grid dimensions
+    dim3 blockDim;    ///< Block dimensions
+    void **args;      ///< Arguments
+    size_t sharedMem; ///< Shared memory
+    hipStream_t stream; ///< Stream identifier
+} hipLaunchParams;
+
+
+typedef enum hipExternalMemoryHandleType_enum {
+    hipExternalMemoryHandleTypeOpaqueFd = 1,
+    hipExternalMemoryHandleTypeOpaqueWin32 = 2,
+    hipExternalMemoryHandleTypeOpaqueWin32Kmt = 3,
+    hipExternalMemoryHandleTypeD3D12Heap = 4,
+    hipExternalMemoryHandleTypeD3D12Resource = 5,
+    hipExternalMemoryHandleTypeD3D11Resource = 6,
+    hipExternalMemoryHandleTypeD3D11ResourceKmt = 7,
+} hipExternalMemoryHandleType;
+
+typedef struct hipExternalMemoryHandleDesc_st {
+    hipExternalMemoryHandleType type;
+    union {
+        int fd;
+        struct {
+            void *handle;
+            const void *name;
+        } win32;
+    } handle;
+    unsigned long long size;
+    unsigned int flags;
+} hipExternalMemoryHandleDesc;
+
+typedef struct hipExternalMemoryBufferDesc_st {
+    unsigned long long offset;
+    unsigned long long size;
+    unsigned int flags;
+} hipExternalMemoryBufferDesc;
+
+typedef void* hipExternalMemory_t;
+
+typedef enum hipExternalSemaphoreHandleType_enum {
+    hipExternalSemaphoreHandleTypeOpaqueFd = 1,
+    hipExternalSemaphoreHandleTypeOpaqueWin32 = 2,
+    hipExternalSemaphoreHandleTypeOpaqueWin32Kmt = 3,
+    hipExternalSemaphoreHandleTypeD3D12Fence = 4
+} hipExternalSemaphoreHandleType;
+
+
+typedef struct hipExternalSemaphoreHandleDesc_st {
+    hipExternalSemaphoreHandleType type;
+    union {
+        int fd;
+        struct {
+            void* handle;
+            const void* name;
+        } win32;
+    } handle;
+    unsigned int flags;
+} hipExternalSemaphoreHandleDesc;
+
+typedef void* hipExternalSemaphore_t;
+
+typedef struct hipExternalSemaphoreSignalParams_st {
+    struct {
+        struct {
+            unsigned long long value;
+        } fence;
+
+        struct {
+            unsigned long long key;
+        } keyedMutex;
+        unsigned int reserved[12];
+    } params;
+
+    unsigned int flags;
+    unsigned int reserved[16];
+} hipExternalSemaphoreSignalParams;
+
+/**
+ * External semaphore wait parameters, compatible with driver type
+ */
+typedef struct hipExternalSemaphoreWaitParams_st {
+    struct {
+        struct {
+            unsigned long long value;
+        } fence;
+
+        struct {
+            unsigned long long key;
+            unsigned int timeoutMs;
+        } keyedMutex;
+        unsigned int reserved[10];
+    } params;
+
+    unsigned int flags;
+    unsigned int reserved[16];
+} hipExternalSemaphoreWaitParams;
+
+#if __HIP_HAS_GET_PCH
+/**
+ * Internal use only. This API may change in the future.
+ * Pre-compiled header for online compilation.
+ */
+ void __hipGetPCH(const char** pch, unsigned int* size);
+#endif
+
+
+// Doxygen end group GlobalDefs
+/**  @} */
+
+
+//-------------------------------------------------------------------------------------------------
+
+
+// The handle allows the async commands to use the stream even if the parent hipStream_t goes
+// out-of-scope.
+// typedef class ihipStream_t* hipStream_t;
+
+
+/*
+ * Opaque structure allows the true event (pointed at by the handle) to remain "live" even if the
+ * surrounding hipEvent_t goes out-of-scope. This is handy for cases where the hipEvent_t goes
+ * out-of-scope but the true event is being written by some async queue or device */
+// typedef struct hipEvent_t {
+//     struct ihipEvent_t *_handle;
+// } hipEvent_t;
+
+
+/**
+ *  @defgroup API HIP API
+ *  @{
+ *
+ *  Defines the HIP API. See the individual sections for more information.
+ */
+
+
+/**
+ *  @defgroup Driver Initialization and Version
+ *  @{
+ *  This section describes the initialization and version functions of HIP runtime API.
+ *
+ */
+
+/**
+ * @brief Explicitly initializes the HIP runtime.
+ *
+ * Most HIP APIs implicitly initialize the HIP runtime.
+ * This API provides control over the timing of the initialization.
+ */
+// TODO-ctx - more description on error codes.
+hipError_t hipInit(unsigned int flags);
+
+/**
+ * @brief Returns the approximate HIP driver version.
+ *
+ * @param [out] driverVersion
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning The HIP feature set does not correspond to an exact CUDA SDK driver revision.
+ * This function always sets *driverVersion to 4 as an approximation, though HIP supports
+ * some features which were introduced in later CUDA SDK revisions.
+ * HIP application code should not rely on the driver revision number here and should
+ * use arch feature flags to test device capabilities or conditional compilation.
+ *
+ * @see hipRuntimeGetVersion
+ */
+hipError_t hipDriverGetVersion(int* driverVersion);
+
+/**
+ * @brief Returns the approximate HIP Runtime version.
+ *
+ * @param [out] runtimeVersion
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning On the HIP/HCC path this function returns the HIP runtime patch version; on the
+ * HIP/NVCC path it returns the CUDA runtime version.
+ *
+ * @see hipDriverGetVersion
+ */
+hipError_t hipRuntimeGetVersion(int* runtimeVersion);
+
+
+/**
+ * @brief Returns a handle to a compute device
+ * @param [out] device
+ * @param [in] ordinal
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGet(hipDevice_t* device, int ordinal);
+
+/**
+ * @brief Returns the compute capability of the device
+ * @param [out] major
+ * @param [out] minor
+ * @param [in] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device);
+
+/**
+ * @brief Returns an identifier string for the device.
+ * @param [out] name
+ * @param [in] len
+ * @param [in] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
+
+
+/**
+ * @brief Returns a value for attr of link between two devices
+ * @param [out] value
+ * @param [in] attr
+ * @param [in] srcDevice
+ * @param [in] dstDevice
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
+                                    int srcDevice, int dstDevice);
+
+/**
+ * @brief Returns a PCI Bus Id string for the device, overloaded to take int device ID.
+ * @param [out] pciBusId
+ * @param [in] len
+ * @param [in] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device);
+
+
+/**
+ * @brief Returns a handle to a compute device.
+ * @param [out] device handle
+ * @param [in] PCI Bus ID
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId);
+
+
+/**
+ * @brief Returns the total amount of memory on the device.
+ * @param [out] bytes
+ * @param [in] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device);
+
+
+// doxygen end initialization
+/**
+ * @}
+ */
+
+/**
+ *  @defgroup Device Device Management
+ *  @{
+ *  This section describes the device management functions of HIP runtime API.
+ */
+
+/**
+ * @brief Waits on all active streams on current device
+ *
+ * When this command is invoked, the host thread gets blocked until all commands on all streams
+ * associated with the device have completed. HIP does not support multiple blocking modes (yet!).
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipSetDevice, hipDeviceReset
+ */
+hipError_t hipDeviceSynchronize(void);
+
+
+/**
+ * @brief The state of current device is discarded and updated to a fresh state.
+ *
+ * Calling this function deletes all streams created, memory allocated, kernels running, and
+ * events created. Make sure that no other thread is using the device or the streams, memory,
+ * kernels, or events associated with the current device.
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipDeviceSynchronize
+ */
+hipError_t hipDeviceReset(void);
+
+
+/**
+ * @brief Set default device to be used for subsequent hip API calls from this thread.
+ *
+ * @param[in] deviceId Valid device in range 0...hipGetDeviceCount().
+ *
+ * Sets @p device as the default device for the calling host thread. Valid device ids are 0...
+ * (hipGetDeviceCount()-1).
+ *
+ * Many HIP APIs implicitly use the "default device":
+ *
+ * - Any device memory subsequently allocated from this host thread (using hipMalloc) will be
+ * allocated on device.
+ * - Any streams or events created from this host thread will be associated with device.
+ * - Any kernels launched from this host thread (using hipLaunchKernel) will be executed on device
+ * (unless a specific stream is specified, in which case the device associated with that stream will
+ * be used).
+ *
+ * This function may be called from any host thread. Multiple host threads may use the same device.
+ * This function does no synchronization with the previous or new device, and has very little
+ * runtime overhead. Applications can use hipSetDevice to quickly switch the default device before
+ * making a HIP runtime call which uses the default device.
+ *
+ * The default device is stored in thread-local-storage for each thread.
+ * Thread-pool implementations may inherit the default device of the previous thread. A good
+ * practice is to always call hipSetDevice at the start of a HIP coding sequence to establish a
+ * known standard device.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorDeviceAlreadyInUse
+ *
+ * @see hipGetDevice, hipGetDeviceCount
+ */
+hipError_t hipSetDevice(int deviceId);
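+
+// Illustrative sketch (hypothetical example, not part of the original header):
+// enumerating devices and making each the default in turn; allocations and
+// launches issued afterwards target the selected device. The function name is
+// made up for illustration.
+#if 0
+void visit_all_devices() {
+    int count = 0;
+    if (hipGetDeviceCount(&count) != hipSuccess) return;
+    for (int id = 0; id < count; ++id) {
+        hipSetDevice(id);
+        // ... per-device allocations and kernel launches ...
+    }
+}
+#endif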
+
+
+/**
+ * @brief Return the default device id for the calling host thread.
+ *
+ * @param [out] device *device is written with the default device
+ *
+ * HIP maintains a default device for each thread using thread-local-storage.
+ * This device is used implicitly for HIP runtime APIs called by this thread.
+ * hipGetDevice returns in * @p device the default device for the calling host thread.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ * @see hipSetDevice, hipGetDevicesizeBytes
+ */
+hipError_t hipGetDevice(int* deviceId);
+
+
+/**
+ * @brief Return number of compute-capable devices.
+ *
+ * @param [out] count Returns number of compute-capable devices.
+ *
+ * @returns #hipSuccess, #hipErrorNoDevice
+ *
+ *
+ * Returns in @p *count the number of devices that have the ability to run compute commands. If
+ * there are no such devices, then @ref hipGetDeviceCount will return #hipErrorNoDevice. If 1 or
+ * more devices can be found, then hipGetDeviceCount returns #hipSuccess.
+ */
+hipError_t hipGetDeviceCount(int* count);
+
+/**
+ * @brief Query for a specific device attribute.
+ *
+ * @param [out] pi pointer to value to return
+ * @param [in] attr attribute to query
+ * @param [in] deviceId which device to query for information
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceId);
+
+/**
+ * @brief Returns device properties.
+ *
+ * @param [out] prop written with device properties
+ * @param [in]  deviceId which device to query for information
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice
+ * @bug HCC always returns 0 for maxThreadsPerMultiProcessor
+ * @bug HCC always returns 0 for regsPerBlock
+ * @bug HCC always returns 0 for l2CacheSize
+ *
+ * Populates hipGetDeviceProperties with information for the specified device.
+ */
+hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int deviceId);
+
+
+/**
+ * @brief Set L1/Shared cache partition.
+ *
+ * @param [in] cacheConfig
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized
+ * Note: AMD devices and some Nvidia GPUs do not support reconfigurable cache. This hint is ignored
+ * on those architectures.
+ *
+ */
+hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig);
+
+
+/**
+ * @brief Get Cache configuration for the current device
+ *
+ * @param [in] cacheConfig
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized
+ * Note: AMD devices and some Nvidia GPUs do not support reconfigurable cache. This hint is ignored
+ * on those architectures.
+ *
+ */
+hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* cacheConfig);
+
+/**
+ * @brief Get Resource limits of current device
+ *
+ * @param [out] pValue
+ * @param [in]  limit
+ *
+ * @returns #hipSuccess, #hipErrorUnsupportedLimit, #hipErrorInvalidValue
+ * Note: Currently, only hipLimitMallocHeapSize is available
+ *
+ */
+hipError_t hipDeviceGetLimit(size_t* pValue, enum hipLimit_t limit);
+
+
+/**
+ * @brief Returns bank width of shared memory for current device
+ *
+ * @param [out] pConfig
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ *
+ * Note: AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig);
+
+/**
+ * @brief Gets the flags set for current device
+ *
+ * @param [out] flags
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipGetDeviceFlags(unsigned int* flags);
+
+/**
+ * @brief The bank width of shared memory on current device is set
+ *
+ * @param [in] config
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ *
+ * Note: AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config);
+
+/**
+ * @brief The current device behavior is changed according to the flags passed.
+ *
+ * @param [in] flags
+ *
+ * The schedule flags impact how HIP waits for the completion of a command running on a device.
+ * hipDeviceScheduleSpin         : HIP runtime will actively spin in the thread which submitted
+ * the work until the command completes. This offers the lowest latency, but will consume a CPU
+ * core and may increase power.
+ * hipDeviceScheduleYield        : The HIP runtime will yield the CPU to the system so that other
+ * tasks can use it. This may increase latency to detect the completion but will consume less
+ * power and is friendlier to other tasks in the system.
+ * hipDeviceScheduleBlockingSync : On the ROCm platform, this is a synonym for
+ * hipDeviceScheduleYield.
+ * hipDeviceScheduleAuto         : Use a heuristic to select between Spin and Yield modes. If the
+ * number of HIP contexts is greater than the number of logical processors in the system, use Spin
+ * scheduling. Else use Yield scheduling.
+ *
+ * hipDeviceMapHost              : Allow mapping host memory. On ROCm, this is always allowed and
+ * the flag is ignored.
+ * hipDeviceLmemResizeToMax      : @warning ROCm silently ignores this flag.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorSetOnActiveProcess
+ *
+ *
+ */
+hipError_t hipSetDeviceFlags(unsigned flags);
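+
+// Illustrative sketch (hypothetical example, not part of the original header):
+// selecting the yield scheduling policy described above for the current
+// device, trading completion-detection latency for CPU time.
+#if 0
+void prefer_yield_waits() {
+    hipSetDeviceFlags(hipDeviceScheduleYield);
+}
+#endif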
+
+/**
+ * @brief Device which matches hipDeviceProp_t is returned
+ *
+ * @param [out] device ID
+ * @param [in]  device properties pointer
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop);
+
+/**
+ * @brief Returns the link type and hop count between two devices
+ *
+ * @param [in] device1 Ordinal for device1
+ * @param [in] device2 Ordinal for device2
+ * @param [out] linktype Returns the link type (See hsa_amd_link_info_type_t) between the two devices
+ * @param [out] hopcount Returns the hop count between the two devices
+ *
+ * Queries and returns the HSA link type and the hop count between the two specified devices.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorRuntimeOther
+ */
+hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount);
+
+
+// TODO: implement IPC apis
+
+/**
+ * @brief Gets an interprocess memory handle for an existing device memory
+ *          allocation
+ *
+ * Takes a pointer to the base of an existing device memory allocation created
+ * with hipMalloc and exports it for use in another process. This is a
+ * lightweight operation and may be called multiple times on an allocation
+ * without adverse effects.
+ *
+ * If a region of memory is freed with hipFree and a subsequent call
+ * to hipMalloc returns memory with the same device address,
+ * hipIpcGetMemHandle will return a unique handle for the
+ * new memory.
+ *
+ * @param handle - Pointer to user allocated hipIpcMemHandle to return
+ *                    the handle in.
+ * @param devPtr - Base pointer to previously allocated device memory
+ *
+ * @returns
+ * hipSuccess,
+ * hipErrorInvalidHandle,
+ * hipErrorOutOfMemory,
+ * hipErrorMapFailed,
+ *
+ */
+hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr);
+
+/**
+ * @brief Opens an interprocess memory handle exported from another process
+ *          and returns a device pointer usable in the local process.
+ *
+ * Maps memory exported from another process with hipIpcGetMemHandle into
+ * the current device address space. For contexts on different devices
+ * hipIpcOpenMemHandle can attempt to enable peer access between the
+ * devices as if the user called hipDeviceEnablePeerAccess. This behavior is
+ * controlled by the hipIpcMemLazyEnablePeerAccess flag.
+ * hipDeviceCanAccessPeer can determine if a mapping is possible.
+ *
+ * Contexts that may open hipIpcMemHandles are restricted in the following way.
+ * hipIpcMemHandles from each device in a given process may only be opened
+ * by one context per device per other process.
+ *
+ * Memory returned from hipIpcOpenMemHandle must be freed with
+ * hipIpcCloseMemHandle.
+ *
+ * Calling hipFree on an exported memory region before calling
+ * hipIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ *
+ * @param devPtr - Returned device pointer
+ * @param handle - hipIpcMemHandle to open
+ * @param flags  - Flags for this operation. Must be specified as hipIpcMemLazyEnablePeerAccess
+ *
+ * @returns
+ * hipSuccess,
+ * hipErrorMapFailed,
+ * hipErrorInvalidHandle,
+ * hipErrorTooManyPeers
+ *
+ * @note No guarantees are made about the address returned in @p *devPtr.
+ * In particular, multiple processes may not receive the same address for the same @p handle.
+ *
+ */
+hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags);
+
+/**
+ * @brief Close memory mapped with hipIpcOpenMemHandle
+ *
+ * Unmaps memory returned by hipIpcOpenMemHandle. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * @param devPtr - Device pointer returned by hipIpcOpenMemHandle
+ *
+ * @returns
+ * hipSuccess,
+ * hipErrorMapFailed,
+ * hipErrorInvalidHandle,
+ *
+ */
+hipError_t hipIpcCloseMemHandle(void* devPtr);
+
+
+hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event);
+hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle);
+
+// end doxygen Device
+/**
+ * @}
+ */
+
+/**
+ *
+ *  @defgroup Execution Execution Control
+ *  @{
+ *  This section describes the execution control functions of HIP runtime API.
+ *
+ */
+/**
+ * @brief Set attribute for a specific function
+ *
+ * @param [in] func
+ * @param [in] attr
+ * @param [in] value
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
+ *
+ * Note: AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value);
+
+/**
+ * @brief Set Cache configuration for a specific function
+ *
+ * @param [in] config
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized
+ * Note: AMD devices and some Nvidia GPUs do not support reconfigurable cache. This hint is ignored
+ * on those architectures.
+ *
+ */
+hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t config);
+
+/**
+ * @brief Set shared memory configuration for a specific function
+ *
+ * @param [in] func
+ * @param [in] config
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
+ *
+ * Note: AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config);
+
+//doxygen end execution
+/**
+ * @}
+ */
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Error Error Handling
+ *  @{
+ *  This section describes the error handling functions of HIP runtime API.
+ */
+
+/**
+ * @brief Return last error returned by any HIP runtime API call and resets the stored error code
+ * to #hipSuccess
+ *
+ * @returns return code from last HIP called from the active host thread
+ *
+ * Returns the last error that has been returned by any of the runtime calls in the same host
+ * thread, and then resets the saved error to #hipSuccess.
+ *
+ * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+hipError_t hipGetLastError(void);
+
+
+/**
+ * @brief Return last error returned by any HIP runtime API call.
+ *
+ * @return #hipSuccess
+ *
+ * Returns the last error that has been returned by any of the runtime calls in the same host
+ * thread. Unlike hipGetLastError, this function does not reset the saved error code.
+ *
+ * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+hipError_t hipPeekAtLastError(void);
+
+
+/**
+ * @brief Return name of the specified error code in text form.
+ *
+ * @param hip_error Error code to convert to name.
+ * @return const char pointer to the NULL-terminated error name
+ *
+ * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+const char* hipGetErrorName(hipError_t hip_error);
+
+
+/**
+ * @brief Return handy text string message to explain the error which occurred
+ *
+ * @param hipError Error code to convert to string.
+ * @return const char pointer to the NULL-terminated error string
+ *
+ * @warning : on HCC, this function returns the name of the error (same as hipGetErrorName)
+ *
+ * @see hipGetErrorName, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+const char* hipGetErrorString(hipError_t hipError);
+
+// end doxygen Error
+/**
+ * @}
+ */
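+
+// Illustrative sketch (hypothetical example, not part of the original header):
+// a typical checking macro built on hipGetErrorString. The macro name is made
+// up; it is not a HIP API.
+#if 0
+#include <cstdio>
+#include <cstdlib>
+#define MY_HIP_CHECK(expr)                                                    \
+    do {                                                                      \
+        hipError_t _e = (expr);                                               \
+        if (_e != hipSuccess) {                                               \
+            fprintf(stderr, "%s failed: %s\n", #expr, hipGetErrorString(_e)); \
+            exit(EXIT_FAILURE);                                               \
+        }                                                                     \
+    } while (0)
+// Usage: MY_HIP_CHECK(hipSetDevice(0));
+#endif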
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Stream Stream Management
+ *  @{
+ *  This section describes the stream management functions of HIP runtime API.
+ *  The following Stream APIs are not (yet) supported in HIP:
+ *  - cudaStreamAttachMemAsync
+ */
+
+
+/**
+ * @brief Create an asynchronous stream.
+ *
+ * @param[in, out] stream Valid pointer to hipStream_t. This function writes the memory with the
+ * newly created stream.
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to
+ * reference the newly created stream in subsequent hipStream* commands. The stream is allocated on
+ * the heap and will remain allocated even if the handle goes out-of-scope. To release the memory
+ * used by the stream, the application must call hipStreamDestroy.
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+hipError_t hipStreamCreate(hipStream_t* stream);
+
+
+/**
+ * @brief Create an asynchronous stream.
+ *
+ * @param[in, out] stream Pointer to new stream
+ * @param[in ] flags to control stream creation.
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to
+ * reference the newly created stream in subsequent hipStream* commands. The stream is allocated on
+ * the heap and will remain allocated even if the handle goes out-of-scope. To release the memory
+ * used by the stream, the application must call hipStreamDestroy. Flags controls behavior of the
+ * stream. See #hipStreamDefault, #hipStreamNonBlocking.
+ *
+ *
+ * @see hipStreamCreate, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+
+hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags);
+
+
+/**
+ * @brief Create an asynchronous stream with the specified priority.
+ *
+ * @param[in, out] stream Pointer to new stream
+ * @param[in ] flags to control stream creation.
+ * @param[in ] priority of the stream. Lower numbers represent higher priorities.
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * Create a new asynchronous stream with the specified priority. @p stream returns an opaque handle
+ * that can be used to reference the newly created stream in subsequent hipStream* commands. The
+ * stream is allocated on the heap and will remain allocated even if the handle goes out-of-scope.
+ * To release the memory used by the stream, the application must call hipStreamDestroy. Flags
+ * controls behavior of the stream. See #hipStreamDefault, #hipStreamNonBlocking.
+ *
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+
+hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority);
+
+
+/**
+ * @brief Returns numerical values that correspond to the least and greatest stream priority.
+ *
+ * @param[in, out] leastPriority pointer in which value corresponding to least priority is returned.
+ * @param[in, out] greatestPriority pointer in which value corresponding to greatest priority is returned.
+ *
+ * Returns in *leastPriority and *greatestPriority the numerical values that correspond to the least
+ * and greatest stream priority respectively. Stream priorities follow a convention where lower numbers
+ * imply greater priorities. The range of meaningful stream priorities is given by
+ * [*greatestPriority, *leastPriority]. If the user attempts to create a stream with a priority value
+ * that is outside the meaningful range as specified by this API, the priority is automatically
+ * clamped to within the valid range.
+ */
+
+hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
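+
+// Illustrative sketch (hypothetical example, not part of the original header):
+// querying the valid priority range and creating a non-blocking stream at the
+// highest priority (lower numbers mean higher priority).
+#if 0
+void make_high_priority_stream(hipStream_t* s) {
+    int least = 0, greatest = 0;
+    hipDeviceGetStreamPriorityRange(&least, &greatest);
+    hipStreamCreateWithPriority(s, hipStreamNonBlocking, greatest);
+}
+#endif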
+
+
+/**
+ * @brief Destroys the specified stream.
+ *
+ * @param[in] stream stream identifier.
+ * @return #hipSuccess #hipErrorInvalidHandle
+ *
+ * Destroys the specified stream.
+ *
+ * If commands are still executing on the specified stream, some may complete execution before the
+ * queue is deleted.
+ *
+ * The queue may be destroyed while some commands are still inflight, or may wait for all commands
+ * queued to the stream before destroying it.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamQuery, hipStreamWaitEvent,
+ * hipStreamSynchronize
+ */
+hipError_t hipStreamDestroy(hipStream_t stream);
+
+
+/**
+ * @brief Return #hipSuccess if all of the operations in the specified @p stream have completed, or
+ * #hipErrorNotReady if not.
+ *
+ * @param[in] stream stream to query
+ *
+ * @return #hipSuccess, #hipErrorNotReady, #hipErrorInvalidHandle
+ *
+ * This is thread-safe and returns a snapshot of the current state of the queue. However, if other
+ * host threads are sending work to the stream, the status may change immediately after the function
+ * is called. It is typically used for debug.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamWaitEvent, hipStreamSynchronize,
+ * hipStreamDestroy
+ */
+hipError_t hipStreamQuery(hipStream_t stream);
+
+
+/**
+ * @brief Wait for all commands in stream to complete.
+ *
+ * @param[in] stream stream identifier.
+ *
+ * @return #hipSuccess, #hipErrorInvalidHandle
+ *
+ * This command is host-synchronous: the host will block until the specified stream is empty.
+ *
+ * This command follows standard null-stream semantics. Specifically, specifying the null stream
+ * will cause the command to wait for other streams on the same device to complete all pending
+ * operations.
+ *
+ * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active
+ * or blocking.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamWaitEvent, hipStreamDestroy
+ *
+ */
+hipError_t hipStreamSynchronize(hipStream_t stream);
+
+
+/**
+ * @brief Make the specified compute stream wait for an event
+ *
+ * @param[in] stream stream to make wait.
+ * @param[in] event event to wait on
+ * @param[in] flags control operation [must be 0]
+ *
+ * @return #hipSuccess, #hipErrorInvalidHandle
+ *
+ * This function inserts a wait operation into the specified stream.
+ * All future work submitted to @p stream will wait until @p event reports completion before
+ * beginning execution.
+ *
+ * This function only waits for commands in the current stream to complete. Notably, this function
+ * does not implicitly wait for commands in the default stream to complete, even if the specified
+ * stream is created with hipStreamNonBlocking = 0.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamDestroy
+ */
+hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags);
+
+
+/**
+ * @brief Return flags associated with this stream.
+ *
+ * @param[in] stream stream to be queried
+ * @param[in,out] flags Pointer to an unsigned integer in which the stream's flags are returned
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle
+ *
+ * Return flags associated with this stream in *@p flags.
+ *
+ * @see hipStreamCreateWithFlags
+ */
+hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int* flags);
+
+
+/**
+ * @brief Query the priority of a stream.
+ *
+ * @param[in] stream stream to be queried
+ * @param[in,out] priority Pointer to an integer in which the stream's priority is returned
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle
+ *
+ * Query the priority of a stream. The priority is returned in *@p priority.
+ *
+ * @see hipStreamCreateWithFlags
+ */
+hipError_t hipStreamGetPriority(hipStream_t stream, int* priority);
+
+
+/**
+ * @brief Create an asynchronous stream with the specified CU mask.
+ *
+ * @param[in, out] stream Pointer to new stream
+ * @param[in ] cuMaskSize Size of CU mask bit array passed in.
+ * @param[in ] cuMask Bit-vector representing the CU mask. Each active bit represents using one CU.
+ * The first 32 bits represent the first 32 CUs, and so on. If its size is greater than physical
+ * CU number (i.e., multiProcessorCount member of hipDeviceProp_t), the extra elements are ignored.
+ * It is the user's responsibility to make sure the input is meaningful.
+ * @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue
+ *
+ * Create a new asynchronous stream with the specified CU mask. @p stream returns an opaque handle
+ * that can be used to reference the newly created stream in subsequent hipStream* commands. The
+ * stream is allocated on the heap and will remain allocated even if the handle goes out-of-scope.
+ * To release the memory used by the stream, the application must call hipStreamDestroy.
+ *
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+hipError_t hipExtStreamCreateWithCUMask(hipStream_t* stream, uint32_t cuMaskSize, const uint32_t* cuMask);
+
+
+/**
+ * @brief Get CU mask associated with an asynchronous stream
+ *
+ * @param[in] stream stream to be queried
+ * @param[in] cuMaskSize number of uint32_t elements in the block of memory allocated by the user
+ * @param[out] cuMask Pointer to a pre-allocated block of uint32_t elements in which
+ * the stream's CU mask is returned. The CU mask is returned in chunks of 32 bits, where
+ * each active bit represents one active CU
+ * @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32_t* cuMask);
+
+/**
+ * Stream callback function type
+ */
+typedef void (*hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData);
+
+/**
+ * @brief Adds a callback to be called on the host after all currently enqueued
+ * items in the stream have completed. For each
+ * hipStreamAddCallback call, a callback will be executed exactly once.
+ * The callback will block later work in the stream until it is finished.
+ * @param[in] stream   - Stream to add callback to
+ * @param[in] callback - The function to call once preceding stream operations are complete
+ * @param[in] userData - User specified data to be passed to the callback function
+ * @param[in] flags    - Reserved for future use, must be 0
+ * @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorNotSupported
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamQuery, hipStreamSynchronize,
+ * hipStreamWaitEvent, hipStreamDestroy, hipStreamCreateWithPriority
+ *
+ */
+hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData,
+                                unsigned int flags);
+
+
+// end doxygen Stream
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup StreamM Stream Memory Operations
+ * @{
+ * This section describes Stream Memory Wait and Write functions of HIP runtime API.
+ */
+
+
+/**
+ * @brief Enqueues a wait command to the stream.
+ *
+ * @param [in] stream - Stream identifier
+ * @param [in] ptr    - Pointer to memory object allocated using 'hipMallocSignalMemory' flag
+ * @param [in] value  - Value to be used in compare operation
+ * @param [in] flags  - Defines the compare operation, supported values are hipStreamWaitValueGte,
+ * hipStreamWaitValueEq, hipStreamWaitValueAnd and hipStreamWaitValueNor
+ * @param [in] mask   - Mask to be applied on value at memory before it is compared with value,
+ * default value is set to enable every bit
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Enqueues a wait command to the stream; all operations enqueued on this stream after this call
+ * will not execute until the defined wait condition is true.
+ *
+ * hipStreamWaitValueGte: waits until *ptr&mask >= value
+ * hipStreamWaitValueEq : waits until *ptr&mask == value
+ * hipStreamWaitValueAnd: waits until ((*ptr&mask) & value) != 0
+ * hipStreamWaitValueNor: waits until ~((*ptr&mask) | (value&mask)) != 0
+ *
+ * @note when using 'hipStreamWaitValueNor', mask is applied on both 'value' and '*ptr'.
+ *
+ * @note Support for hipStreamWaitValue32 can be queried using 'hipDeviceGetAttribute()' and the
+ * 'hipDeviceAttributeCanUseStreamWaitValue' flag.
+ *
+ * @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue64, hipStreamWriteValue64,
+ * hipStreamWriteValue32, hipDeviceGetAttribute
+ */
+
+hipError_t hipStreamWaitValue32(hipStream_t stream, void* ptr, int32_t value, unsigned int flags,
+                                uint32_t mask __dparm(0xFFFFFFFF));
+
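+/*
+ * Illustrative sketch (not part of this header): a stream-ordered handshake built from the
+ * write/wait value operations in this section. The signal location is assumed to be
+ * allocated with the 'hipMallocSignalMemory' flag, as hipStreamWaitValue32 requires.
+ * 'streamA' and 'streamB' are placeholder streams; error checks are omitted for brevity.
+ *
+ *   int32_t* signal = NULL;
+ *   hipExtMallocWithFlags((void**)&signal, sizeof(*signal), hipMallocSignalMemory);
+ *
+ *   // streamB stalls until streamA writes 1 to the signal location.
+ *   hipStreamWaitValue32(streamB, signal, 1, hipStreamWaitValueEq, 0xFFFFFFFF);
+ *   hipStreamWriteValue32(streamA, signal, 1, 0);
+ */
+
+/**
+ * @brief Enqueues a wait command to the stream.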
+ *
+ * @param [in] stream - Stream identifier
+ * @param [in] ptr    - Pointer to memory object allocated using 'hipMallocSignalMemory' flag
+ * @param [in] value  - Value to be used in compare operation
+ * @param [in] flags  - Defines the compare operation, supported values are hipStreamWaitValueGte,
+ * hipStreamWaitValueEq, hipStreamWaitValueAnd and hipStreamWaitValueNor.
+ * @param [in] mask   - Mask to be applied on value at memory before it is compared with value,
+ * default value is set to enable every bit
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Enqueues a wait command to the stream; all operations enqueued on this stream after this call
+ * will not execute until the defined wait condition is true.
+ *
+ * hipStreamWaitValueGte: waits until *ptr&mask >= value
+ * hipStreamWaitValueEq : waits until *ptr&mask == value
+ * hipStreamWaitValueAnd: waits until ((*ptr&mask) & value) != 0
+ * hipStreamWaitValueNor: waits until ~((*ptr&mask) | (value&mask)) != 0
+ *
+ * @note when using 'hipStreamWaitValueNor', mask is applied on both 'value' and '*ptr'.
+ *
+ * @note Support for hipStreamWaitValue64 can be queried using 'hipDeviceGetAttribute()' and the
+ * 'hipDeviceAttributeCanUseStreamWaitValue' flag.
+ *
+ * @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue32, hipStreamWriteValue64,
+ * hipStreamWriteValue32, hipDeviceGetAttribute
+ */
+
+hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, int64_t value, unsigned int flags,
+                                uint64_t mask __dparm(0xFFFFFFFFFFFFFFFF));
+
+/**
+ * @brief Enqueues a write command to the stream.
+ *
+ * @param [in] stream - Stream identifier
+ * @param [in] ptr    - Pointer to a GPU accessible memory object
+ * @param [in] value  - Value to be written
+ * @param [in] flags  - reserved, ignored for now, will be used in future releases
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Enqueues a write command to the stream; the write operation is performed after all earlier
+ * commands on this stream have completed execution.
+ *
+ * @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue64, hipStreamWaitValue32,
+ * hipStreamWaitValue64
+ */
+hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, int32_t value, unsigned int flags);
+
+/**
+ * @brief Enqueues a write command to the stream.
+ *
+ * @param [in] stream - Stream identifier
+ * @param [in] ptr    - Pointer to a GPU accessible memory object
+ * @param [in] value  - Value to be written
+ * @param [in] flags  - reserved, ignored for now, will be used in future releases
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * Enqueues a write command to the stream; the write operation is performed after all earlier
+ * commands on this stream have completed execution.
+ *
+ * @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32,
+ * hipStreamWaitValue64
+ */
+hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, int64_t value, unsigned int flags);
+
+
+// end doxygen Stream Memory Operations
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup Event Event Management
+ * @{
+ * This section describes the event management functions of HIP runtime API.
+ */
+
+/**
+ * @brief Create an event with the specified flags
+ *
+ * @param[in,out] event Returns the newly created event.
+ * @param[in] flags Flags to control event behavior. Valid values are #hipEventDefault,
+ * #hipEventBlockingSync, #hipEventDisableTiming, #hipEventInterprocess
+
+ * #hipEventDefault : Default flag. The event will use active synchronization and will support
+ * timing. Active synchronization provides the lowest possible latency at the expense of dedicating
+ * a CPU to poll on the event.
+ * #hipEventBlockingSync : The event will use blocking synchronization: if hipEventSynchronize is
+ * called on this event, the thread will block until the event completes. This can increase latency
+ * for the synchronization but can result in lower power and more resources for other CPU threads.
+ * #hipEventDisableTiming : Disable recording of timing information. Events created with this flag
+ * will not record profiling data and provide the best performance if used for synchronization.
+
+ * @warning On AMD platform, hipEventInterprocess support is under development. Use of this flag
+ * will return an error.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ * #hipErrorLaunchFailure, #hipErrorOutOfMemory
+ *
+ * @see hipEventCreate, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime
+ */
+hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags);
+
+
+/**
+ * @brief Create an event
+ *
+ * @param[in,out] event Returns the newly created event.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ * #hipErrorLaunchFailure, #hipErrorOutOfMemory
+ *
+ * @see hipEventCreateWithFlags, hipEventRecord, hipEventQuery, hipEventSynchronize,
+ * hipEventDestroy, hipEventElapsedTime
+ */
+hipError_t hipEventCreate(hipEvent_t* event);
+
+
+/**
+ * @brief Record an event in the specified stream.
+ *
+ * @param[in] event event to record.
+ * @param[in] stream stream in which to record event.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized,
+ * #hipErrorInvalidHandle, #hipErrorLaunchFailure
+ *
+ * hipEventQuery() or hipEventSynchronize() must be used to determine when the event
+ * transitions from "recording" (after hipEventRecord() is called) to "recorded"
+ * (when timestamps are set, if requested).
+ *
+ * Events which are recorded in a non-NULL stream will transition from the "recording"
+ * to the "recorded" state when they reach the head of
+ * the specified stream, after all previous
+ * commands in that stream have completed executing.
+ *
+ * If hipEventRecord() has been previously called on this event, then this call will overwrite any
+ * existing state in event.
+ *
+ * If this function is called on an event that is currently being recorded, results are undefined:
+ * the outstanding recording may save state into the event, and the order is not guaranteed.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize,
+ * hipEventDestroy, hipEventElapsedTime
+ *
+ */
+#ifdef __cplusplus
+hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream = NULL);
+#else
+hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
+#endif
+
+/**
+ * @brief Destroy the specified event.
+ *
+ * @param[in] event Event to destroy.
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ * #hipErrorLaunchFailure
+ *
+ * Releases memory associated with the event. If the event is recording but has not completed
+ * recording when hipEventDestroy() is called, the function will return immediately and the
+ * completion_future resources will be released later, when the hipDevice is synchronized.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventRecord,
+ * hipEventElapsedTime
+ */
+hipError_t hipEventDestroy(hipEvent_t event);
+
+
+/**
+ * @brief Wait for an event to complete.
+ *
+ * This function will block until the event is ready, waiting for all previous work in the stream
+ * specified when event was recorded with hipEventRecord().
+ *
+ * If hipEventRecord() has not been called on @p event, this function returns immediately.
+ *
+ * TODO-hip- This function needs to support hipEventBlockingSync parameter.
+ *
+ * @param[in] event Event on which to wait.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized,
+ * #hipErrorInvalidHandle, #hipErrorLaunchFailure
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventDestroy, hipEventRecord,
+ * hipEventElapsedTime
+ */
+hipError_t hipEventSynchronize(hipEvent_t event);
+
+
+/**
+ * @brief Return the elapsed time between two events.
+ *
+ * @param[out] ms : Return time between start and stop in ms.
+ * @param[in] start : Start event.
+ * @param[in] stop : Stop event.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotReady, #hipErrorInvalidHandle,
+ * #hipErrorNotInitialized, #hipErrorLaunchFailure
+ *
+ * Computes the elapsed time between two events. Time is computed in ms, with
+ * a resolution of approximately 1 us.
+ *
+ * Events which are recorded in a NULL stream will block until all commands
+ * on all other streams complete execution, and then record the timestamp.
+ *
+ * Events which are recorded in a non-NULL stream will record their timestamp
+ * when they reach the head of the specified stream, after all previous
+ * commands in that stream have completed executing. Thus the time that
+ * the event recorded may be significantly after the host calls hipEventRecord().
+ *
+ * If hipEventRecord() has not been called on either event, then #hipErrorInvalidHandle is
+ * returned. If hipEventRecord() has been called on both events, but the timestamp has not yet been
+ * recorded on one or both events (that is, hipEventQuery() would return #hipErrorNotReady on at
+ * least one of the events), then #hipErrorNotReady is returned.
+ *
+ * Note, for HIP Events used in kernel dispatch using hipExtLaunchKernelGGL/hipExtLaunchKernel,
+ * events passed in hipExtLaunchKernelGGL/hipExtLaunchKernel are not explicitly recorded and should
+ * only be used to get elapsed time for that specific launch. In case events are used across
+ * multiple dispatches, for example, start and stop events from different hipExtLaunchKernelGGL/
+ * hipExtLaunchKernel calls, they will be treated as invalid unrecorded events and HIP will return
+ * the error "hipErrorInvalidHandle" from hipEventElapsedTime.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventDestroy, hipEventRecord,
+ * hipEventSynchronize
+ */
+hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop);
+
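+
+/*
+ * Illustrative sketch (not part of this header): timing a kernel launch with events.
+ * 'myKernel', its arguments, and the launch configuration are placeholders; error
+ * checks are omitted for brevity.
+ *
+ *   hipEvent_t start, stop;
+ *   hipEventCreate(&start);
+ *   hipEventCreate(&stop);
+ *
+ *   hipEventRecord(start, stream);
+ *   hipLaunchKernelGGL(myKernel, grid, block, 0, stream, args);
+ *   hipEventRecord(stop, stream);
+ *
+ *   hipEventSynchronize(stop);
+ *   float ms = 0.0f;
+ *   hipEventElapsedTime(&ms, start, stop);
+ *
+ *   hipEventDestroy(start);
+ *   hipEventDestroy(stop);
+ */
+
+/**
+ * @brief Query event status
+ *
+ * @param[in] event Event to query.
+ * @returns #hipSuccess, #hipErrorNotReady, #hipErrorInvalidHandle, #hipErrorInvalidValue,
+ * #hipErrorNotInitialized, #hipErrorLaunchFailure
+ *
+ * Query the status of the specified event. This function will return #hipSuccess if all
+ * commands in the appropriate stream (specified to hipEventRecord()) have completed. If that work
+ * has not completed, or if hipEventRecord() was not called on the event, then #hipErrorNotReady is
+ * returned.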
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventRecord, hipEventDestroy,
+ * hipEventSynchronize, hipEventElapsedTime
+ */
+hipError_t hipEventQuery(hipEvent_t event);
+
+
+// end doxygen Events
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup Memory Memory Management
+ * @{
+ * This section describes the memory management functions of HIP runtime API.
+ * The following CUDA APIs are not currently supported:
+ * - cudaMalloc3D
+ * - cudaMalloc3DArray
+ * - TODO - more 2D, 3D, array APIs here.
+ *
+ *
+ */
+
+/**
+ * @brief Return attributes for the specified pointer
+ *
+ * @param[out] attributes Attributes for the specified pointer
+ * @param[in]  ptr        Pointer to get attributes for
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ * @see hipGetDeviceCount, hipGetDevice, hipSetDevice, hipChooseDevice
+ */
+hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr);
+
+
+
+/**
+ * @brief Imports an external semaphore.
+ *
+ * @param[out] extSem_out    Returned handle to an external semaphore object
+ * @param[in]  semHandleDesc Semaphore import handle descriptor
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipImportExternalSemaphore(hipExternalSemaphore_t* extSem_out,
+                                      const hipExternalSemaphoreHandleDesc* semHandleDesc);
+
+
+
+/**
+ * @brief Signals a set of external semaphore objects.
+ *
+ * @param[in] extSemArray External semaphores to be signaled
+ * @param[in] paramsArray Array of semaphore parameters
+ * @param[in] numExtSems  Number of semaphores to signal
+ * @param[in] stream      Stream to enqueue the signal operations in
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+
+hipError_t hipSignalExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemArray,
+                                            const hipExternalSemaphoreSignalParams* paramsArray,
+                                            unsigned int numExtSems, hipStream_t stream);
+
+
+
+/**
+ * @brief Waits on a set of external semaphore objects
+ *
+ * @param[in] extSemArray External semaphores to be waited on
+ * @param[in] paramsArray Array of semaphore parameters
+ * @param[in] numExtSems  Number of semaphores to wait on
+ * @param[in] stream      Stream to enqueue the wait operations in
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipWaitExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemArray,
+                                          const hipExternalSemaphoreWaitParams* paramsArray,
+                                          unsigned int numExtSems, hipStream_t stream);
+
+
+
+/**
+ * @brief Destroys an external semaphore object and releases any references to the underlying
+ * resource. Any outstanding signals or waits must have completed before the semaphore is
+ * destroyed.
+ *
+ * @param[in] extSem handle to an external semaphore object
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipDestroyExternalSemaphore(hipExternalSemaphore_t extSem);
+
+
+/**
+* @brief Imports an external memory object.
+*
+* @param[out] extMem_out    Returned handle to an external memory object
+* @param[in]  memHandleDesc Memory import handle descriptor
+*
+* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+*/
+hipError_t hipImportExternalMemory(hipExternalMemory_t* extMem_out, const hipExternalMemoryHandleDesc* memHandleDesc);
+
+/**
+* @brief Maps a buffer onto an imported memory object.
+*
+* @param[out] devPtr     Returned device pointer to buffer
+* @param[in]  extMem     Handle to external memory object
+* @param[in]  bufferDesc Buffer descriptor
+*
+* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+*/
+hipError_t hipExternalMemoryGetMappedBuffer(void **devPtr, hipExternalMemory_t extMem, const hipExternalMemoryBufferDesc *bufferDesc);
+
+
+/**
+* @brief Destroys an external memory object.
+*
+* @param[in] extMem External memory object to be destroyed
+*
+* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+*/
+hipError_t hipDestroyExternalMemory(hipExternalMemory_t extMem);
+
+/**
+ * @brief Allocate memory on the default accelerator
+ *
+ * @param[out] ptr Pointer to the allocated memory
+ * @param[in]  size Requested memory size
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr)
+ *
+ * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray,
+ * hipHostFree, hipHostMalloc
+ */
+hipError_t hipMalloc(void** ptr, size_t size);
+
+/**
+ * @brief Allocate memory on the default accelerator
+ *
+ * @param[out] ptr Pointer to the allocated memory
+ * @param[in]  sizeBytes Requested memory size
+ * @param[in]  flags Type of memory allocation
+ *
+ * If sizeBytes is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr)
+ *
+ * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray,
+ * hipHostFree, hipHostMalloc
+ */
+hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flags);
+
+/**
+ * @brief Allocate pinned host memory [Deprecated]
+ *
+ * @param[out] ptr Pointer to the allocated host pinned memory
+ * @param[in]  size Requested memory size
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @deprecated use hipHostMalloc() instead
+ */
+DEPRECATED("use hipHostMalloc instead")
+hipError_t hipMallocHost(void** ptr, size_t size);
+
+/**
+ * @brief Allocate pinned host memory [Deprecated]
+ *
+ * @param[out] ptr Pointer to the allocated host pinned memory
+ * @param[in]  size Requested memory size
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @deprecated use hipHostMalloc() instead
+ */
+DEPRECATED("use hipHostMalloc instead")
+hipError_t hipMemAllocHost(void** ptr, size_t size);
+
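+/*
+ * Illustrative sketch (not part of this header): the basic allocate/copy/free pattern
+ * using hipMalloc above and hipMemcpy (declared later in this header). Error checks are
+ * omitted for brevity.
+ *
+ *   float host[256] = {0};
+ *   float* device = NULL;
+ *   hipMalloc((void**)&device, sizeof(host));
+ *   hipMemcpy(device, host, sizeof(host), hipMemcpyHostToDevice);
+ *   // ... launch work that reads/writes 'device' ...
+ *   hipMemcpy(host, device, sizeof(host), hipMemcpyDeviceToHost);
+ *   hipFree(device);
+ */
+
+/**
+ * @brief Allocate device accessible page locked host memory
+ *
+ * @param[out] ptr Pointer to the allocated host pinned memory
+ * @param[in]  size Requested memory size
+ * @param[in]  flags Type of host memory allocation
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.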
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @see hipSetDeviceFlags, hipHostFree
+ */
+hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags);
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @addtogroup MemoryM Managed Memory (ROCm HMM)
+ * @{
+ * @ingroup Memory
+ * This section describes the managed memory management functions of HIP runtime API.
+ *
+ */
+
+/**
+ * @brief Allocates memory that will be automatically managed by AMD HMM.
+ *
+ * @param [out] dev_ptr - pointer to allocated device memory
+ * @param [in]  size    - requested allocation size in bytes
+ * @param [in]  flags   - must be either hipMemAttachGlobal or hipMemAttachHost
+ *                        (defaults to hipMemAttachGlobal)
+ *
+ * @returns #hipSuccess, #hipErrorMemoryAllocation, #hipErrorNotSupported, #hipErrorInvalidValue
+ */
+hipError_t hipMallocManaged(void** dev_ptr,
+                            size_t size,
+                            unsigned int flags __dparm(hipMemAttachGlobal));
+
+/**
+ * @brief Prefetches memory to the specified destination device using AMD HMM.
+ *
+ * @param [in] dev_ptr pointer to be prefetched
+ * @param [in] count   size in bytes for prefetching
+ * @param [in] device  destination device to prefetch to
+ * @param [in] stream  stream to enqueue prefetch operation
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemPrefetchAsync(const void* dev_ptr,
+                               size_t count,
+                               int device,
+                               hipStream_t stream __dparm(0));
+
+/**
+ * @brief Advise about the usage of a given memory range to AMD HMM.
+ *
+ * @param [in] dev_ptr pointer to memory to set the advice for
+ * @param [in] count   size in bytes of the memory range
+ * @param [in] advice  advice to be applied for the specified memory range
+ * @param [in] device  device to apply the advice for
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemAdvise(const void* dev_ptr,
+                        size_t count,
+                        hipMemoryAdvise advice,
+                        int device);
+
+/**
+ * @brief Query an attribute of a given memory range in AMD HMM.
+ *
+ * @param [in,out] data      a pointer to a memory location where the result of each
+ *                           attribute query will be written to
+ * @param [in] data_size     the size of data
+ * @param [in] attribute     the attribute to query
+ * @param [in] dev_ptr       start of the range to query
+ * @param [in] count         size of the range to query
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemRangeGetAttribute(void* data,
+                                   size_t data_size,
+                                   hipMemRangeAttribute attribute,
+                                   const void* dev_ptr,
+                                   size_t count);
+
+/**
+ * @brief Query attributes of a given memory range in AMD HMM.
+ *
+ * @param [in,out] data        a two-dimensional array containing pointers to memory locations
+ *                             where the result of each attribute query will be written to
+ * @param [in] data_sizes      an array containing the sizes of each result
+ * @param [in] attributes      an array of attributes to query
+ * @param [in] num_attributes  number of attributes to query (must match the number of
+ *                             attributes in @p attributes)
+ * @param [in] dev_ptr         start of the range to query
+ * @param [in] count           size of the range to query
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemRangeGetAttributes(void** data,
+                                    size_t* data_sizes,
+                                    hipMemRangeAttribute* attributes,
+                                    size_t num_attributes,
+                                    const void* dev_ptr,
+                                    size_t count);
+
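+/*
+ * Illustrative sketch (not part of this header): allocating managed memory, advising the
+ * runtime about its preferred location, and prefetching it to the current device. Error
+ * checks are omitted for brevity.
+ *
+ *   int device = 0;
+ *   hipGetDevice(&device);
+ *
+ *   float* data = NULL;
+ *   hipMallocManaged((void**)&data, 256 * sizeof(float), hipMemAttachGlobal);
+ *   hipMemAdvise(data, 256 * sizeof(float), hipMemAdviseSetPreferredLocation, device);
+ *   hipMemPrefetchAsync(data, 256 * sizeof(float), device, 0);
+ *   // ... use 'data' from host or device code ...
+ *   hipFree(data);
+ */
+
+/**
+ * @brief Attach memory to a stream asynchronously in AMD HMM.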
+ *
+ * @param [in] stream  - stream in which to enqueue the attach operation
+ * @param [in] dev_ptr - pointer to memory (must be a pointer to managed memory or
+ *                       to a valid host-accessible region of system-allocated memory)
+ * @param [in] length  - length of memory (defaults to zero)
+ * @param [in] flags   - must be one of hipMemAttachGlobal, hipMemAttachHost or
+ *                       hipMemAttachSingle (defaults to hipMemAttachSingle)
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipStreamAttachMemAsync(hipStream_t stream,
+                                   hipDeviceptr_t* dev_ptr,
+                                   size_t length __dparm(0),
+                                   unsigned int flags __dparm(hipMemAttachSingle));
+
+// end doxygen Managed Memory
+/**
+ * @}
+ */
+
+/**
+ * @brief Allocate device accessible page locked host memory [Deprecated]
+ *
+ * @param[out] ptr Pointer to the allocated host pinned memory
+ * @param[in]  size Requested memory size
+ * @param[in]  flags Type of host memory allocation
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @deprecated use hipHostMalloc() instead
+ */
+DEPRECATED("use hipHostMalloc instead")
+hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags);
+
+/**
+ * @brief Get Device pointer from Host Pointer allocated through hipHostMalloc
+ *
+ * @param[out] devPtr Device Pointer mapped to passed host pointer
+ * @param[in]  hstPtr Host Pointer allocated through hipHostMalloc
+ * @param[in]  flags  Flags to be passed for extension
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorOutOfMemory
+ *
+ * @see hipSetDeviceFlags, hipHostMalloc
+ */
+hipError_t hipHostGetDevicePointer(void** devPtr, void* hstPtr, unsigned int flags);
+
+/**
+ * @brief Return flags associated with host pointer
+ *
+ * @param[out] flagsPtr Memory location to store flags
+ * @param[in]  hostPtr  Host Pointer allocated through hipHostMalloc
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipHostMalloc
+ */
+hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr);
+
+/**
+ * @brief Register host memory so it can be accessed from the current device.
+ *
+ * @param[in] hostPtr   Pointer to host memory to be registered.
+ * @param[in] sizeBytes Size of the host memory
+ * @param[in] flags     Flags controlling the registration. See below.
+ *
+ * Flags:
+ * - #hipHostRegisterDefault   Memory is Mapped and Portable
+ * - #hipHostRegisterPortable  Memory is considered registered by all contexts. HIP only supports
+ *   one context so this is always assumed true.
+ * - #hipHostRegisterMapped    Map the allocation into the address space for the current device.
+ *   The device pointer can be obtained with #hipHostGetDevicePointer.
+ *
+ *
+ * After registering the memory, use #hipHostGetDevicePointer to obtain the mapped device pointer.
+ * On many systems, the mapped device pointer will have a different value than the mapped host
+ * pointer. Applications must use the device pointer in device code, and the host pointer in host
+ * code.
+ *
+ * On some systems, registered memory is pinned. On some systems, registered memory may not
+ * actually be pinned but uses OS or hardware facilities to allow GPU access to the host memory.
+ *
+ * Developers are strongly encouraged to register memory blocks which are aligned to the host
+ * cache-line size (typically 64 bytes, but this can be obtained from the CPUID instruction).
+ *
+ * If registering non-aligned pointers, the application must take care when registering pointers
+ * from the same cache line on different devices. HIP's coarse-grained synchronization model does
+ * not guarantee correct results if different devices write to different parts of the same cache
+ * block - typically one of the writes will "win" and overwrite data from the other registered
+ * memory region.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @see hipHostUnregister, hipHostGetFlags, hipHostGetDevicePointer
+ */
+hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags);
+
+/**
+ * @brief Un-register host pointer
+ *
+ * @param[in] hostPtr Host pointer previously registered with #hipHostRegister
+ * @return Error code
+ *
+ * @see hipHostRegister
+ */
+hipError_t hipHostUnregister(void* hostPtr);
+
+/**
+ * Allocates at least width (in bytes) * height bytes of linear memory.
+ * Padding may occur to ensure alignment requirements are met for the given row.
+ * The change in width size due to padding will be returned in *pitch.
+ * Currently the alignment is set to 128 bytes.
+ *
+ * @param[out] ptr   Pointer to the allocated device memory
+ * @param[out] pitch Pitch for allocation (in bytes)
+ * @param[in]  width  Requested pitched allocation width (in bytes)
+ * @param[in]  height Requested pitched allocation height
+ *
+ * If width or height is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is
+ * returned.
+ *
+ * @return Error code
+ *
+ * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+
+hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height);
+
+/**
+ * Allocates at least width (in bytes) * height bytes of linear memory.
+ * Padding may occur to ensure alignment requirements are met for the given row.
+ * The change in width size due to padding will be returned in *pitch.
+ * Currently the alignment is set to 128 bytes.
+ *
+ * @param[out] dptr  Pointer to the allocated device memory
+ * @param[out] pitch Pitch for allocation (in bytes)
+ * @param[in]  widthInBytes Requested pitched allocation width (in bytes)
+ * @param[in]  height       Requested pitched allocation height
+ *
+ * If width or height is 0, no memory is allocated, *dptr returns nullptr, and hipSuccess is
+ * returned.
+ * The intended usage of pitch is as a separate parameter of the allocation, used to compute addresses within the 2D array.
+ * Given the row and column of an array element of type T, the address is computed as:
+ * T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
+ *
+ * @return Error code
+ *
+ * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+
+hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch, size_t widthInBytes, size_t height, unsigned int elementSizeBytes);
+
+/**
+ * @brief Free memory allocated by the hcc hip memory allocation API.
+ * This API performs an implicit hipDeviceSynchronize() call.
+ * If pointer is NULL, the hip runtime is initialized and hipSuccess is returned.
+ *
+ * @param[in] ptr Pointer to memory to be freed
+ * @return #hipSuccess
+ * @return #hipErrorInvalidDevicePointer (if pointer is invalid, including host pointers allocated
+ * with hipHostMalloc)
+ *
+ * @see hipMalloc, hipMallocPitch, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+hipError_t hipFree(void* ptr);
+
+/**
+ * @brief Free memory allocated by the hcc hip host memory allocation API. [Deprecated]
+ *
+ * @param[in] ptr Pointer to memory to be freed
+ * @return #hipSuccess,
+ *         #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with
+ *         hipMalloc)
+ *
+ * @deprecated use hipHostFree() instead
+ */
+DEPRECATED("use hipHostFree instead")
+hipError_t hipFreeHost(void* ptr);
+
+/**
+ * @brief Free memory allocated by the hcc hip host memory allocation API.
+ * This API performs an implicit hipDeviceSynchronize() call.
+ * If pointer is NULL, the hip runtime is initialized and hipSuccess is returned.
+ *
+ * @param[in] ptr Pointer to memory to be freed
+ * @return #hipSuccess,
+ *         #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with
+ *         hipMalloc)
+ *
+ * @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+hipError_t hipHostFree(void* ptr);
+
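+/*
+ * Illustrative sketch (not part of this header): registering existing host memory and
+ * obtaining the matching device pointer. 'buffer' and 'bufferSize' are placeholders for
+ * application-owned, preferably cache-line-aligned host memory; error checks are omitted
+ * for brevity.
+ *
+ *   hipHostRegister(buffer, bufferSize, hipHostRegisterMapped);
+ *   void* deviceView = NULL;
+ *   hipHostGetDevicePointer(&deviceView, buffer, 0);
+ *   // ... use deviceView in device code and buffer in host code ...
+ *   hipHostUnregister(buffer);
+ */
+
+/**
+ * @brief Copy data from src to dst.
+ *
+ * It supports memory from host to device,
+ * device to host, device to device and host to host.
+ * The src and dst must not overlap.
+ *
+ * For hipMemcpy, the copy is always performed by the current device (set by hipSetDevice).
+ * For multi-gpu or peer-to-peer configurations, it is recommended to set the current device to the
+ * device where the src data is physically located. For optimal peer-to-peer copies, the copy device
+ * must be able to access the src and dst pointers (by calling hipDeviceEnablePeerAccess with the
+ * copy agent as the current device and src/dst as the peerDevice argument). If this is not done,
+ * the hipMemcpy will still work, but will perform the copy using a staging buffer on the host.
+ * Calling hipMemcpy with dst and src pointers that do not match the hipMemcpyKind results in
+ * undefined behavior.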
+ *
+ * @param[out] dst Data being copy to
+ * @param[in]  src Data being copy from
+ * @param[in]  sizeBytes Data size in bytes
+ * @param[in]  kind Memory copy type
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree, #hipErrorUnknown
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
+
+/**
+ * @brief Copy data from src to dst, using the specified stream.
+ *
+ * Performs the same copy as hipMemcpy, but the operation is enqueued on @p stream rather
+ * than the default stream.
+ */
+hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,
+                               hipMemcpyKind kind, hipStream_t stream);
+/**
+ * @brief Copy data from Host to Device
+ *
+ * @param[out] dst Data being copy to
+ * @param[in]  src Data being copy from
+ * @param[in]  sizeBytes Data size in bytes
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes);
+
+/**
+ * @brief Copy data from Device to Host
+ *
+ * @param[out] dst Data being copy to
+ * @param[in]  src Data being copy from
+ * @param[in]  sizeBytes Data size in bytes
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes);
+
+/**
+ * @brief Copy data from Device to Device
+ *
+ * @param[out] dst Data being copy to
+ * @param[in]  src Data being copy from
+ * @param[in]  sizeBytes Data size in bytes
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes);
+
+/**
+ * @brief Copy data from Host to Device asynchronously
+ *
+ * @param[out] dst Data being copy to
+ * @param[in]  src Data being copy from
+ * @param[in]  sizeBytes Data size in bytes
+ * @param[in]  stream Stream identifier
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, hipStream_t stream);
+
+/**
+ * @brief Copy data from Device to Host asynchronously
+ *
+ * @param[out] dst Data being copy to
+ * @param[in]  src Data being copy from
+ * @param[in]  sizeBytes Data size in bytes
+ * @param[in]  stream Stream identifier
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream);
+
+/**
+ * @brief Copy data from Device to Device asynchronously
+ *
+ * @param[out] dst Data being copy to
+ * @param[in]  src Data being copy from
+ * @param[in]  sizeBytes Data size in bytes
+ * @param[in]  stream Stream identifier
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes,
+                              hipStream_t stream);
+
+hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
+                              hipModule_t hmod, const char* name);
+
+hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol);
+hipError_t hipGetSymbolSize(size_t* size, const void* symbol);
+hipError_t hipMemcpyToSymbol(const void* symbol, const void* src,
+                             size_t sizeBytes, size_t offset __dparm(0),
+                             hipMemcpyKind kind __dparm(hipMemcpyHostToDevice));
+hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src,
+                                  size_t sizeBytes, size_t offset,
+                                  hipMemcpyKind kind, hipStream_t stream __dparm(0));
+hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol,
+                               size_t sizeBytes, size_t offset __dparm(0),
+                               hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost));
+hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol,
+                                    size_t sizeBytes, size_t offset,
+                                    hipMemcpyKind kind,
+                                    hipStream_t stream __dparm(0));
+
+/**
+ * @brief Copy data from src to dst asynchronously.
+ *
+ * @warning If the source or destination is not pinned host memory, the memory copy will be
+ * performed synchronously. For best performance, use hipHostMalloc to allocate host memory that
+ * is transferred asynchronously.
+ *
+ * @warning on HCC hipMemcpyAsync does not support overlapped H2D and D2H copies.
+ * For hipMemcpy, the copy is always performed by the device associated with the specified stream.
+ *
+ * For multi-gpu or peer-to-peer configurations, it is recommended to use a stream which is
+ * attached to the device where the src data is physically located. For optimal peer-to-peer
+ * copies, the copy device must be able to access the src and dst pointers (by calling
+ * hipDeviceEnablePeerAccess with the copy agent as the current device and src/dst as the
+ * peerDevice argument). If this is not done, the hipMemcpy will still work, but will perform the
+ * copy using a staging buffer on the host.
+ *
+ * @param[out] dst Data being copy to
+ * @param[in]  src Data being copy from
+ * @param[in]  sizeBytes Data size in bytes
+ * @param[in]  kind Memory copy type
+ * @param[in]  stream Stream which the copy is being enqueued in
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree, #hipErrorUnknown
+ *
+ * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
+ * hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyToSymbol,
+ * hipMemcpyFromSymbol, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync,
+ * hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync,
+ * hipMemcpyFromSymbolAsync
+ */
+hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
+                          hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant
+ * byte value @p value.
+ *
+ * @param[out] dst Data being filled
+ * @param[in]  value Constant value to be set
+ * @param[in]  sizeBytes Data size in bytes
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemset(void* dst, int value, size_t sizeBytes);
+
+/**
+ * @brief Fills the first count bytes of the memory area pointed to by dest with the constant
+ * byte value @p value.
+ *
+ * @param[out] dest Data ptr to be filled
+ * @param[in]  value Constant value to be set
+ * @param[in]  count Number of values to be set
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count);
+
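+/*
+ * Illustrative sketch (not part of this header): clearing a device buffer asynchronously
+ * on a stream before copying fresh data into it, using hipMemsetD8Async (documented just
+ * below) and hipMemcpyAsync (above). 'deviceBuffer', 'hostBuffer', 'bufferSize' and
+ * 'stream' are placeholders; error checks are omitted for brevity.
+ *
+ *   hipMemsetD8Async(deviceBuffer, 0, bufferSize, stream);
+ *   hipMemcpyAsync((void*)deviceBuffer, hostBuffer, bufferSize,
+ *                  hipMemcpyHostToDevice, stream);
+ *   hipStreamSynchronize(stream);
+ */
+
+/**
+ * @brief Fills the first count bytes of the memory area pointed to by dest with the constant
+ * byte value @p value.
+ *
+ * hipMemsetD8Async() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.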
+ *
+ * @param[out] dest Data ptr to be filled
+ * @param[in]  value Constant value to be set
+ * @param[in]  count Number of values to be set
+ * @param[in]  stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills the first count elements of the memory area pointed to by dest with the constant
+ * short value @p value.
+ *
+ * @param[out] dest Data ptr to be filled
+ * @param[in]  value Constant value to be set
+ * @param[in]  count Number of values to be set
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count);
+
+/**
+ * @brief Fills the first count elements of the memory area pointed to by dest with the constant
+ * short value @p value.
+ *
+ * hipMemsetD16Async() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ * @param[out] dest Data ptr to be filled
+ * @param[in]  value Constant value to be set
+ * @param[in]  count Number of values to be set
+ * @param[in]  stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills the memory area pointed to by dest with the constant integer
+ * value @p value for the specified number of times.
+ *
+ * @param[out] dest Data being filled
+ * @param[in]  value Constant value to be set
+ * @param[in]  count Number of values to be set
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD32(hipDeviceptr_t dest, int value, size_t count);
+
+/**
+ * @brief Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant
+ * byte value @p value.
+ *
+ * hipMemsetAsync() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ * @param[out] dst Pointer to device memory
+ * @param[in]  value - Value to set for each byte of specified memory
+ * @param[in]  sizeBytes - Size in bytes to set
+ * @param[in]  stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills the memory area pointed to by dst with the constant integer
+ * value @p value for the specified number of times.
+ *
+ * hipMemsetD32Async() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ * @param[out] dst Pointer to device memory
+ * @param[in]  value - Value to set for each 4-byte word of specified memory
+ * @param[in]  count - number of values to be set
+ * @param[in]  stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+hipError_t hipMemsetD32Async(hipDeviceptr_t dst, int value, size_t count,
+                             hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills the memory area pointed to by dst with the constant value.
+ *
+ * @param[out] dst Pointer to device memory
+ * @param[in]  pitch - Pitch of the destination device memory (in bytes)
+ * @param[in]  value - constant value to be set
+ * @param[in]  width - Width of the region to set (columns in bytes)
+ * @param[in]  height - Height of the region to set (rows)
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+
+hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height);
+
+/**
+ * @brief Fills asynchronously the memory area pointed to by dst with the constant value.
+ *
+ * @param[in] dst Pointer to device memory
+ * @param[in] pitch - Pitch of the destination device memory (in bytes)
+ * @param[in] value - constant value to be set
+ * @param[in] width - Width of the region to set (columns in bytes)
+ * @param[in] height - Height of the region to set (rows)
+ * @param[in] stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+
+hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills synchronously the memory area pointed to by pitchedDevPtr with the constant value.
+ *
+ * @param[in] pitchedDevPtr - Pointer to pitched device memory
+ * @param[in] value - constant value to be set
+ * @param[in] extent - Size parameters of the memory region to set
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent);
+
+/**
+ * @brief Fills asynchronously the memory area pointed to by pitchedDevPtr with the constant value.
+ *
+ * @param[in] pitchedDevPtr - Pointer to pitched device memory
+ * @param[in] value - constant value to be set
+ * @param[in] extent - Size parameters of the memory region to set
+ * @param[in] stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Query memory info.
+ * Return snapshot of free memory, and total allocatable memory on the device.
+ *
+ * Returns in *free a snapshot of the current free memory.
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ * @warning On HCC, the free memory only accounts for memory allocated by this process and may be
+ * optimistic.
+ **/
+hipError_t hipMemGetInfo(size_t* free, size_t* total);
+
+
+hipError_t hipMemPtrGetInfo(void* ptr, size_t* size);
+
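+
+/*
+ * Illustrative sketch (not part of this header): checking available device memory before
+ * attempting a large allocation. 'requiredBytes' and 'ptr' are placeholders; error checks
+ * are omitted for brevity.
+ *
+ *   size_t freeBytes = 0, totalBytes = 0;
+ *   hipMemGetInfo(&freeBytes, &totalBytes);
+ *   if (freeBytes >= requiredBytes) {
+ *       hipMalloc(&ptr, requiredBytes);
+ *   }
+ */
+
+/**
+ * @brief Allocate an array on the device.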
+ * + * @param[out] array Pointer to allocated array in device memory + * @param[in] desc Requested channel format + * @param[in] width Requested array allocation width + * @param[in] height Requested array allocation height + * @param[in] flags Requested properties of allocated array + * @return #hipSuccess, #hipErrorOutOfMemory + * + * @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree + */ +hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, size_t width, + size_t height __dparm(0), unsigned int flags __dparm(hipArrayDefault)); +hipError_t hipArrayCreate(hipArray** pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray); + +hipError_t hipArrayDestroy(hipArray* array); + +hipError_t hipArray3DCreate(hipArray** array, const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray); + +hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent); + +/** + * @brief Frees an array on the device. + * + * @param[in] array Pointer to array to free + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized + * + * @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipHostMalloc, hipHostFree + */ +hipError_t hipFreeArray(hipArray* array); + +/** + * @brief Frees a mipmapped array on the device + * + * @param[in] mipmappedArray - Pointer to mipmapped array to free + * + * @return #hipSuccess, #hipErrorInvalidValue + */ +hipError_t hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray); + +/** + * @brief Allocate an array on the device. + * + * @param[out] array Pointer to allocated array in device memory + * @param[in] desc Requested channel format + * @param[in] extent Requested array allocation width, height and depth + * @param[in] flags Requested properties of allocated array + * @return #hipSuccess, #hipErrorOutOfMemory + * + * @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree + */ + +hipError_t hipMalloc3DArray(hipArray** array, const struct hipChannelFormatDesc* desc, + struct hipExtent extent, unsigned int flags); + +/** + * @brief Allocate a mipmapped array on the device + * + * @param[out] mipmappedArray - Pointer to allocated mipmapped array in device memory + * @param[in] desc - Requested channel format + * @param[in] extent - Requested allocation size (width field in elements) + * @param[in] numLevels - Number of mipmap levels to allocate + * @param[in] flags - Flags for extensions + * + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation + */ +hipError_t hipMallocMipmappedArray( + hipMipmappedArray_t *mipmappedArray, + const struct hipChannelFormatDesc* desc, + struct hipExtent extent, + unsigned int numLevels, + unsigned int flags __dparm(0)); + +/** + * @brief Gets a mipmap level of a HIP mipmapped array + * + * @param[out] levelArray - Returned mipmap level HIP array + * @param[in] mipmappedArray - HIP mipmapped array + * @param[in] level - Mipmap level + * + * @return #hipSuccess, #hipErrorInvalidValue + */ +hipError_t hipGetMipmappedArrayLevel( + hipArray_t *levelArray, + hipMipmappedArray_const_t mipmappedArray, + unsigned int level); + +/** + * @brief Copies data between host and device. 
+ * + * @param[in] dst Destination memory address + * @param[in] dpitch Pitch of destination memory + * @param[in] src Source memory address + * @param[in] spitch Pitch of source memory + * @param[in] width Width of matrix transfer (columns in bytes) + * @param[in] height Height of matrix transfer (rows) + * @param[in] kind Type of transfer + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, + * hipMemcpyAsync + */ +hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind); + +/** + * @brief Copies memory for 2D arrays. + * @param[in] pCopy Parameters for the memory copy + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, + * hipMemcpyToSymbol, hipMemcpyAsync +*/ +hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy); + +/** + * @brief Copies memory for 2D arrays. + * @param[in] pCopy Parameters for the memory copy + * @param[in] stream Stream to use + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, + * hipMemcpyToSymbol, hipMemcpyAsync +*/ +hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0)); + +/** + * @brief Copies data between host and device. + * + * @param[in] dst Destination memory address + * @param[in] dpitch Pitch of destination memory + * @param[in] src Source memory address + * @param[in] spitch Pitch of source memory + * @param[in] width Width of matrix transfer (columns in bytes) + * @param[in] height Height of matrix transfer (rows) + * @param[in] kind Type of transfer + * @param[in] stream Stream to use + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, + * hipMemcpyAsync + */ +hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind, hipStream_t stream __dparm(0)); + +/** + * @brief Copies data between host and device. + * + * @param[in] dst Destination memory address + * @param[in] wOffset Destination starting X offset + * @param[in] hOffset Destination starting Y offset + * @param[in] src Source memory address + * @param[in] spitch Pitch of source memory + * @param[in] width Width of matrix transfer (columns in bytes) + * @param[in] height Height of matrix transfer (rows) + * @param[in] kind Type of transfer + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, + * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, + * hipMemcpyAsync + */ +hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, + size_t spitch, size_t width, size_t height, hipMemcpyKind kind); + +/** + * @brief Copies data between host and device. 
+ *
+ * @param[in]   dst     Destination memory address
+ * @param[in]   wOffset Destination starting X offset
+ * @param[in]   hOffset Destination starting Y offset
+ * @param[in]   src     Source memory address
+ * @param[in]   spitch  Pitch of source memory
+ * @param[in]   width   Width of matrix transfer (columns in bytes)
+ * @param[in]   height  Height of matrix transfer (rows)
+ * @param[in]   kind    Type of transfer
+ * @param[in]   stream  Stream on which the copy is enqueued
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
+                                   size_t spitch, size_t width, size_t height, hipMemcpyKind kind,
+                                   hipStream_t stream __dparm(0));
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in]   dst     Destination memory address
+ * @param[in]   wOffset Destination starting X offset
+ * @param[in]   hOffset Destination starting Y offset
+ * @param[in]   src     Source memory address
+ * @param[in]   count   Size in bytes to copy
+ * @param[in]   kind    Type of transfer
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
+                            size_t count, hipMemcpyKind kind);
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in]   dst      Destination memory address
+ * @param[in]   srcArray Source memory address
+ * @param[in]   wOffset  Source starting X offset
+ * @param[in]   hOffset  Source starting Y offset
+ * @param[in]   count    Size in bytes to copy
+ * @param[in]   kind     Type of transfer
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
+                              size_t count, hipMemcpyKind kind);
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in]   dst     Destination memory address
+ * @param[in]   dpitch  Pitch of destination memory
+ * @param[in]   src     Source memory address
+ * @param[in]   wOffset Source starting X offset
+ * @param[in]   hOffset Source starting Y offset
+ * @param[in]   width   Width of matrix transfer (columns in bytes)
+ * @param[in]   height  Height of matrix transfer (rows)
+ * @param[in]   kind    Type of transfer
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DFromArray( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind);
+
+/**
+ * @brief Copies data between host and device asynchronously.
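+ *
+ * For the synchronous variant declared above, copying back into a host buffer
+ * looks like this sketch (arr and the extents are assumptions carried over
+ * from the allocation example):
+ * @code
+ * float hostOut[64 * 64];
+ * hipMemcpy2DFromArray(hostOut, 64 * sizeof(float), arr, 0, 0,
+ *                      64 * sizeof(float), 64, hipMemcpyDeviceToHost);
+ * @endcode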
+ *
+ * @param[in]   dst     Destination memory address
+ * @param[in]   dpitch  Pitch of destination memory
+ * @param[in]   src     Source memory address
+ * @param[in]   wOffset Source starting X offset
+ * @param[in]   hOffset Source starting Y offset
+ * @param[in]   width   Width of matrix transfer (columns in bytes)
+ * @param[in]   height  Height of matrix transfer (rows)
+ * @param[in]   kind    Type of transfer
+ * @param[in]   stream  Stream on which the copy is enqueued
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DFromArrayAsync( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in]   dst       Destination memory address
+ * @param[in]   srcArray  Source array
+ * @param[in]   srcOffset Offset in bytes of source array
+ * @param[in]   count     Size of memory copy in bytes
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count);
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in]   dstArray  Destination memory address
+ * @param[in]   dstOffset Offset in bytes of destination array
+ * @param[in]   srcHost   Source host pointer
+ * @param[in]   count     Size of memory copy in bytes
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count);
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in]   p   3D memory copy parameters
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy3D(const struct hipMemcpy3DParms* p);
+
+/**
+ * @brief Copies data between host and device asynchronously.
+ *
+ * @param[in]   p       3D memory copy parameters
+ * @param[in]   stream  Stream to use
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms* p, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Copies data between host and device.
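+ *
+ * For the runtime-level hipMemcpy3D declared above, the parameter structure is
+ * typically filled as in this sketch (width/height/depth, hostBuf, and
+ * pitchedDevPtr are assumptions; the extent width is in bytes when no array
+ * participates in the copy):
+ * @code
+ * hipMemcpy3DParms p = {};
+ * p.srcPtr = make_hipPitchedPtr(hostBuf, width * sizeof(float), width, height);
+ * p.dstPtr = pitchedDevPtr;               // e.g. obtained from hipMalloc3D
+ * p.extent = make_hipExtent(width * sizeof(float), height, depth);
+ * p.kind   = hipMemcpyHostToDevice;
+ * hipMemcpy3D(&p);
+ * @endcode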
+ *
+ * @param[in]   pCopy   3D memory copy parameters
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy);
+
+/**
+ * @brief Copies data between host and device asynchronously.
+ *
+ * @param[in]   pCopy   3D memory copy parameters
+ * @param[in]   stream  Stream to use
+ * @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream);
+
+// doxygen end Memory
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup PeerToPeer PeerToPeer Device Memory Access
+ *  @{
+ *  @warning PeerToPeer support is experimental.
+ *  This section describes the PeerToPeer device memory access functions of HIP runtime API.
+ */
+
+/**
+ * @brief Determine if a device can access a peer's memory.
+ *
+ * @param [out] canAccessPeer Returns the peer access capability (0 or 1)
+ * @param [in] deviceId Device from where memory may be accessed
+ * @param [in] peerDeviceId Device where memory is physically located
+ *
+ * Returns "1" in @p canAccessPeer if the specified @p deviceId is capable
+ * of directly accessing memory physically located on @p peerDeviceId, or "0" if not.
+ *
+ * Returns "0" in @p canAccessPeer if deviceId == peerDeviceId and both are valid devices: a
+ * device is not a peer of itself.
+ *
+ * @returns #hipSuccess,
+ * @returns #hipErrorInvalidDevice if deviceId or peerDeviceId are not valid devices
+ */
+hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId);
+
+
+/**
+ * @brief Enable direct access from current device's virtual address space to memory allocations
+ * physically located on a peer device.
+ *
+ * Memory already allocated on the peer device will be mapped into the address space of the
+ * current device. In addition, all future memory allocations on peerDeviceId will be mapped into
+ * the address space of the current device when the memory is allocated. The peer memory remains
+ * accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
+ *
+ *
+ * @param [in] peerDeviceId
+ * @param [in] flags
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
+ * @returns #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device.
+ */
+hipError_t hipDeviceEnablePeerAccess(int peerDeviceId, unsigned int flags);
+
+
+/**
+ * @brief Disable direct access from current device's virtual address space to memory allocations
+ * physically located on a peer device.
+ *
+ * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been
+ * enabled from the current device.
+ *
+ * @param [in] peerDeviceId
+ *
+ * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
+ */
+hipError_t hipDeviceDisablePeerAccess(int peerDeviceId);
+
+/**
+ * @brief Get information on memory allocations.
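+ *
+ * A sketch of querying the base and size of an allocation from an interior
+ * pointer (the allocation size is an assumption):
+ * @code
+ * void* dev = nullptr;
+ * hipMalloc(&dev, 1024);
+ * hipDeviceptr_t base;
+ * size_t size;
+ * hipMemGetAddressRange(&base, &size, (hipDeviceptr_t)((char*)dev + 100));
+ * // on success, base == dev and size == 1024
+ * @endcode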
+ *
+ * @param [out] pbase Base pointer address
+ * @param [out] psize Size of allocation
+ * @param [in]  dptr  Device pointer
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevicePointer
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr);
+
+#ifndef USE_PEER_NON_UNIFIED
+#define USE_PEER_NON_UNIFIED 1
+#endif
+
+#if USE_PEER_NON_UNIFIED == 1
+/**
+ * @brief Copies memory from one device to memory on another device.
+ *
+ * @param [out] dst - Destination device pointer.
+ * @param [in] dstDeviceId - Destination device
+ * @param [in] src - Source device pointer
+ * @param [in] srcDeviceId - Source device
+ * @param [in] sizeBytes - Size of memory copy in bytes
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ */
+hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId,
+                         size_t sizeBytes);
+
+/**
+ * @brief Copies memory from one device to memory on another device asynchronously.
+ *
+ * @param [out] dst - Destination device pointer.
+ * @param [in] dstDeviceId - Destination device
+ * @param [in] src - Source device pointer
+ * @param [in] srcDevice - Source device
+ * @param [in] sizeBytes - Size of memory copy in bytes
+ * @param [in] stream - Stream identifier
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ */
+hipError_t hipMemcpyPeerAsync(void* dst, int dstDeviceId, const void* src, int srcDevice,
+                              size_t sizeBytes, hipStream_t stream __dparm(0));
+#endif
+
+
+// doxygen end PeerToPeer
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Context Context Management
+ *  @{
+ *  This section describes the context management functions of HIP runtime API.
+ */
+
+/**
+ *
+ *  @addtogroup ContextD Context Management [Deprecated]
+ *  @{
+ *  @ingroup Context
+ *  This section describes the deprecated context management functions of HIP runtime API.
+ */
+
+/**
+ * @brief Create a context and set it as current/default context
+ *
+ * @param [out] ctx Created context
+ * @param [in] flags Context creation flags
+ * @param [in] device Associated device handle
+ *
+ * @return #hipSuccess
+ *
+ * @see hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxPushCurrent,
+ * hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device);
+
+/**
+ * @brief Destroy a HIP context.
+ *
+ * @param [in] ctx Context to destroy
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipCtxCreate, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxDestroy(hipCtx_t ctx);
+
+/**
+ * @brief Pop the current/default context and return the popped context.
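+ *
+ * A porting-reference sketch of the deprecated create/pop pairing (device is
+ * assumed to be a hipDevice_t obtained earlier via hipDeviceGet; new code
+ * should prefer the device API):
+ * @code
+ * hipCtx_t ctx;
+ * hipCtxCreate(&ctx, 0, device);          // ctx becomes the current context
+ * // ... work on this context ...
+ * hipCtx_t popped;
+ * hipCtxPopCurrent(&popped);              // popped == ctx
+ * hipCtxDestroy(ctx);
+ * @endcode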
+ *
+ * @param [out] ctx Popped context
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxSetCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxPopCurrent(hipCtx_t* ctx);
+
+/**
+ * @brief Push the context to be set as current/default context
+ *
+ * @param [in] ctx Context to push
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxPushCurrent(hipCtx_t ctx);
+
+/**
+ * @brief Set the passed context as current/default
+ *
+ * @param [in] ctx Context to set as current
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxSetCurrent(hipCtx_t ctx);
+
+/**
+ * @brief Get the handle of the current/default context
+ *
+ * @param [out] ctx Current context
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetDevice, hipCtxGetFlags, hipCtxPopCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetCurrent(hipCtx_t* ctx);
+
+/**
+ * @brief Get the handle of the device associated with current/default context
+ *
+ * @param [out] device Associated device handle
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize
+ */
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetDevice(hipDevice_t* device);
+
+/**
+ * @brief Returns the approximate HIP API version.
+ *
+ * @param [in] ctx Context to check
+ * @param [out] apiVersion Returned API version
+ *
+ * @return #hipSuccess
+ *
+ * @warning The HIP feature set does not correspond to an exact CUDA SDK API revision.
+ * This function always sets *apiVersion to 4 as an approximation, though HIP supports
+ * some features which were introduced in later CUDA SDK revisions.
+ * HIP application code should not rely on the API revision number here and should
+ * instead use architecture feature flags to test device capabilities or conditional compilation.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetDevice, hipCtxGetFlags, hipCtxPopCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion);
+
+/**
+ * @brief Get Cache configuration for the current context
+ *
+ * @param [out] cacheConfig Returned cache configuration
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUs do not support reconfigurable cache. This hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetCacheConfig(hipFuncCache_t* cacheConfig);
+
+/**
+ * @brief Set L1/Shared cache partition.
+ *
+ * @param [in] cacheConfig Requested cache configuration
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUs do not support reconfigurable cache. This hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxSetCacheConfig(hipFuncCache_t cacheConfig);
+
+/**
+ * @brief Set Shared memory bank configuration.
+ *
+ * @param [in] config Requested shared memory configuration
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config);
+
+/**
+ * @brief Get Shared memory bank configuration.
+ *
+ * @param [out] pConfig Returned shared memory configuration
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig);
+
+/**
+ * @brief Blocks until the default context has completed all preceding requested tasks.
+ *
+ * @return #hipSuccess
+ *
+ * @warning This function waits for all streams on the default context to complete execution, and
+ * then returns.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxSynchronize(void);
+
+/**
+ * @brief Return flags used for creating default context.
+ *
+ * @param [out] flags Returned flags
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetFlags(unsigned int* flags);
+
+/**
+ * @brief Enables direct access to memory allocations in a peer context.
+ *
+ * Memory already allocated on the peer device will be mapped into the address space of the
+ * current device. In addition, all future memory allocations on peerDeviceId will be mapped into
+ * the address space of the current device when the memory is allocated. The peer memory remains
+ * accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
+ *
+ *
+ * @param [in] peerCtx
+ * @param [in] flags
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
+ * #hipErrorPeerAccessAlreadyEnabled
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ * @warning PeerToPeer support is experimental.
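+ *
+ * The non-deprecated device-level flow declared earlier in this header is
+ * sketched below (device IDs 0 and 1 are assumptions):
+ * @code
+ * int canAccess = 0;
+ * hipDeviceCanAccessPeer(&canAccess, 0, 1);
+ * if (canAccess) {
+ *     hipSetDevice(0);
+ *     hipDeviceEnablePeerAccess(1, 0);    // flags: reserved, typically 0
+ * }
+ * @endcode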
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags);
+
+/**
+ * @brief Disable direct access from the current context's virtual address space to memory
+ * allocations physically located on a peer context; unregisters any registered allocations.
+ *
+ * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been
+ * enabled from the current device.
+ *
+ * @param [in] peerCtx Peer context
+ *
+ * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ * @warning PeerToPeer support is experimental.
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx);
+
+// doxygen end Context deprecated
+/**
+ * @}
+ */
+
+/**
+ * @brief Get the state of the primary context.
+ *
+ * @param [in] dev Device to get primary context flags for
+ * @param [out] flags Pointer to store flags
+ * @param [out] active Pointer to store context state; 0 = inactive, 1 = active
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active);
+
+/**
+ * @brief Release the primary context on the GPU.
+ *
+ * @param [in] dev Device whose primary context is released
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ * @warning This function returns #hipSuccess but, by design, does not release the primary
+ * context on the HIP/HCC path.
+ */
+hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev);
+
+/**
+ * @brief Retain the primary context on the GPU.
+ *
+ * @param [out] pctx Returned context handle of the new context
+ * @param [in] dev Device whose primary context is retained
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev);
+
+/**
+ * @brief Resets the primary context on the GPU.
+ *
+ * @param [in] dev Device whose primary context is reset
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev);
+
+/**
+ * @brief Set flags for the primary context.
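+ *
+ * A sketch of the usual flag/retain/release sequence around primary-context
+ * use (the device ordinal is an assumption):
+ * @code
+ * hipDevice_t dev;
+ * hipDeviceGet(&dev, 0);
+ * hipDevicePrimaryCtxSetFlags(dev, 0);    // may fail with
+ *                                         // hipErrorContextAlreadyInUse
+ * hipCtx_t pctx;
+ * hipDevicePrimaryCtxRetain(&pctx, dev);
+ * // ... use the primary context ...
+ * hipDevicePrimaryCtxRelease(dev);
+ * @endcode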
+ *
+ * @param [in] dev Device for which the primary context flags are set
+ * @param [in] flags New flags for the device
+ *
+ * @returns #hipSuccess, #hipErrorContextAlreadyInUse
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags);
+
+// doxygen end Context Management
+/**
+ * @}
+ */
+
+/**
+ *
+ *  @defgroup Module Module Management
+ *  @{
+ *  This section describes the module management functions of HIP runtime API.
+ *
+ */
+
+/**
+ * @brief Loads code object from file into a hipModule_t
+ *
+ * @param [in] fname Path to the code object file
+ * @param [out] module Loaded module
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorFileNotFound,
+ * hipErrorOutOfMemory, hipErrorSharedObjectInitFailed, hipErrorNotInitialized
+ *
+ *
+ */
+hipError_t hipModuleLoad(hipModule_t* module, const char* fname);
+
+/**
+ * @brief Frees the module
+ *
+ * @param [in] module Module to free
+ *
+ * @returns hipSuccess, hipErrorInvalidValue
+ *
+ * The module is freed and the code objects associated with it are destroyed.
+ */
+
+hipError_t hipModuleUnload(hipModule_t module);
+
+/**
+ * @brief Extracts the function with name kname from the module, if present.
+ *
+ * @param [in] module Module to search
+ * @param [in] kname Name of the kernel function
+ * @param [out] function Extracted function handle
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorNotInitialized,
+ * hipErrorNotFound
+ */
+hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, const char* kname);
+
+/**
+ * @brief Find out attributes for a given function.
+ *
+ * @param [out] attr Returned attributes
+ * @param [in] func Function to query
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidDeviceFunction
+ */
+
+hipError_t hipFuncGetAttributes(struct hipFuncAttributes* attr, const void* func);
+
+/**
+ * @brief Find out a specific attribute for a given function.
+ *
+ * @param [out] value Returned attribute value
+ * @param [in] attrib Attribute to query
+ * @param [in] hfunc Function to query
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidDeviceFunction
+ */
+hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc);
+
+/**
+ * @brief Returns the handle of the texture reference with the given name from the module.
+ *
+ * @param [in] hmod Module to search
+ * @param [in] name Name of the texture reference
+ * @param [out] texRef Returned texture reference handle
+ *
+ * @returns hipSuccess, hipErrorNotInitialized, hipErrorNotFound, hipErrorInvalidValue
+ */
+hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name);
+
+/**
+ * @brief Builds a module from a code object residing in host memory; image is a pointer to that
+ * location.
+ *
+ * @param [in] image Pointer to the code object in host memory
+ * @param [out] module Built module
+ *
+ * @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory
+ */
+hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
+
+/**
+ * @brief Builds a module from a code object residing in host memory; image is a pointer to that
+ * location. Options are not used; hipModuleLoadData is called.
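+ *
+ * A typical module workflow, sketched with an assumed code-object file name,
+ * kernel name, and device buffer (see hip_porting_driver_api.md for the
+ * extra-argument convention):
+ * @code
+ * hipModule_t module;
+ * hipFunction_t kernel;
+ * hipModuleLoad(&module, "kernel.co");    // assumed file name
+ * hipModuleGetFunction(&kernel, module, "my_kernel");
+ * struct { void* ptr; } args = { devBuf };   // devBuf: assumed device pointer
+ * size_t argsSize = sizeof(args);
+ * void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+ *                   HIP_LAUNCH_PARAM_BUFFER_SIZE, &argsSize,
+ *                   HIP_LAUNCH_PARAM_END};
+ * hipModuleLaunchKernel(kernel, 1, 1, 1, 256, 1, 1, 0, 0, nullptr, config);
+ * hipModuleUnload(module);
+ * @endcode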
+ *
+ * @param [in] image Pointer to the code object in host memory
+ * @param [out] module Built module
+ * @param [in] numOptions Number of options
+ * @param [in] options Options for JIT
+ * @param [in] optionValues Option values for JIT
+ *
+ * @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory
+ */
+hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions,
+                               hipJitOption* options, void** optionValues);
+
+/**
+ * @brief Launches kernel f with launch parameters and shared memory on stream with arguments passed
+ * to kernelParams or extra
+ *
+ * @param [in] f Kernel to launch.
+ * @param [in] gridDimX X grid dimension specified as multiple of blockDimX.
+ * @param [in] gridDimY Y grid dimension specified as multiple of blockDimY.
+ * @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ.
+ * @param [in] blockDimX X block dimension specified in work-items
+ * @param [in] blockDimY Y block dimension specified in work-items
+ * @param [in] blockDimZ Z block dimension specified in work-items
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ * @param [in] kernelParams Kernel arguments (not yet implemented in HIP; use extra)
+ * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and
+ * must be in the memory layout and alignment expected by the kernel.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ * @warning The kernelParams argument is not yet implemented in HIP. Please use extra instead. Please
+ * refer to hip_porting_driver_api.md for sample usage.
+ */
+hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY,
+                                 unsigned int gridDimZ, unsigned int blockDimX,
+                                 unsigned int blockDimY, unsigned int blockDimZ,
+                                 unsigned int sharedMemBytes, hipStream_t stream,
+                                 void** kernelParams, void** extra);
+
+/**
+ * @brief Launches kernel f with launch parameters and shared memory on stream with arguments passed
+ * to kernelParams, where thread blocks can cooperate and synchronize as they execute
+ *
+ * @param [in] f Kernel to launch.
+ * @param [in] gridDim Grid dimensions specified as multiple of blockDim.
+ * @param [in] blockDimX Block dimensions specified in work-items
+ * @param [in] kernelParams A list of kernel arguments
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge
+ */
+hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX,
+                                      void** kernelParams, unsigned int sharedMemBytes,
+                                      hipStream_t stream);
+
+/**
+ * @brief Launches kernels on multiple devices where thread blocks can cooperate and
+ * synchronize as they execute.
+ *
+ * @param [in] launchParamsList List of launch parameters, one per device.
+ * @param [in] numDevices Size of the launchParamsList array.
+ * @param [in] flags Flags to control launch behavior.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge
+ */
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int numDevices, unsigned int flags);
+
+
+/**
+ * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
+ * on their respective streams before any other work is enqueued on those streams from any other
+ * thread.
+ *
+ *
+ * @param [in] launchParamsList List of launch parameters, one per device.
+ * @param [in] numDevices Size of the launchParamsList array.
+ * @param [in] flags Flags to control launch behavior.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ */
+hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                              int numDevices, unsigned int flags);
+
+
+// doxygen end Module
+/**
+ * @}
+ */
+
+/**
+ *
+ *  @defgroup Occupancy Occupancy
+ *  @{
+ *  This section describes the occupancy functions of HIP runtime API.
+ *
+ */
+
+/**
+ * @brief Determine the grid and block sizes that achieve maximum occupancy for a kernel
+ *
+ * @param [out] gridSize minimum grid size for maximum potential occupancy
+ * @param [out] blockSize block size for maximum potential occupancy
+ * @param [in] f kernel function for which occupancy is calculated
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorInvalidValue
+ */
+
+//TODO - Match CUoccupancyB2DSize
+hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+                                                   hipFunction_t f, size_t dynSharedMemPerBlk,
+                                                   int blockSizeLimit);
+
+/**
+ * @brief Determine the grid and block sizes that achieve maximum occupancy for a kernel
+ *
+ * @param [out] gridSize minimum grid size for maximum potential occupancy
+ * @param [out] blockSize block size for maximum potential occupancy
+ * @param [in] f kernel function for which occupancy is calculated
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit
+ * @param [in] flags Extra flags for occupancy calculation (only default supported)
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorInvalidValue
+ */
+//TODO - Match CUoccupancyB2DSize
+hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+                                                            hipFunction_t f, size_t dynSharedMemPerBlk,
+                                                            int blockSizeLimit, unsigned int flags);
+
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks Returned occupancy
+ * @param [in] f Kernel function (hipFunction_t) for which occupancy is calculated
+ * @param [in] blockSize Block size the kernel is intended to be launched with
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ */
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(
+    int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk);
+
+/**
+ * @brief Returns occupancy for a device function.
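+ *
+ * A sketch of sizing a launch from an occupancy query (f is assumed to be a
+ * hipFunction_t obtained via hipModuleGetFunction):
+ * @code
+ * int gridSize = 0, blockSize = 0;
+ * hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f, 0, 0);
+ * // launch f with gridSize blocks of blockSize work-items
+ * @endcode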
+ *
+ * @param [out] numBlocks Returned occupancy
+ * @param [in] f Kernel function (hipFunction_t) for which occupancy is calculated
+ * @param [in] blockSize Block size the kernel is intended to be launched with
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] flags Extra flags for occupancy calculation (only default supported)
+ */
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
+
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks Returned occupancy
+ * @param [in] f Kernel function for which occupancy is calculated
+ * @param [in] blockSize Block size the kernel is intended to be launched with
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ */
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
+    int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk);
+
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks Returned occupancy
+ * @param [in] f Kernel function for which occupancy is calculated
+ * @param [in] blockSize Block size the kernel is intended to be launched with
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] flags Extra flags for occupancy calculation (currently ignored)
+ */
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags __dparm(hipOccupancyDefault));
+
+/**
+ * @brief Determine the grid and block sizes that achieve maximum occupancy for a kernel
+ *
+ * @param [out] gridSize minimum grid size for maximum potential occupancy
+ * @param [out] blockSize block size for maximum potential occupancy
+ * @param [in] f kernel function for which occupancy is calculated
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorInvalidValue
+ */
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+                                             const void* f, size_t dynSharedMemPerBlk,
+                                             int blockSizeLimit);
+
+// doxygen end Occupancy
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Profiler Profiler Control [Deprecated]
+ *  @{
+ *  This section describes the profiler control functions of HIP runtime API.
+ *
+ *  @warning The cudaProfilerInitialize API format for "configFile" is not supported.
+ *
+ */
+
+
+// TODO - expand descriptions:
+/**
+ * @brief Start recording of profiling information.
+ * When using this API, start the profiler with profiling disabled. (--startdisabled)
+ * @warning hipProfilerStart API is under development.
+ */
+DEPRECATED("use roctracer/rocTX instead")
+hipError_t hipProfilerStart();
+
+
+/**
+ * @brief Stop recording of profiling information.
+ * When using this API, start the profiler with profiling disabled. (--startdisabled)
+ * @warning hipProfilerStop API is under development.
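+ *
+ * A sketch of bracketing a region of interest (requires starting the profiler
+ * with profiling disabled, e.g. --startdisabled):
+ * @code
+ * hipProfilerStart();
+ * // ... kernels and copies to be profiled ...
+ * hipProfilerStop();
+ * @endcode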
+ */
+DEPRECATED("use roctracer/rocTX instead")
+hipError_t hipProfilerStop();
+
+// doxygen end profiler
+/**
+ * @}
+ */
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Clang Launch API to support the triple-chevron syntax
+ *  @{
+ *  This section describes the API to support the triple-chevron syntax.
+ */
+
+/**
+ * @brief Configure a kernel launch.
+ *
+ * @param [in] gridDim grid dimension specified as multiple of blockDim.
+ * @param [in] blockDim block dimensions specified in work-items
+ * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ */
+hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dparm(0), hipStream_t stream __dparm(0));
+
+
+/**
+ * @brief Set a kernel argument.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ * @param [in] arg Pointer to the argument in host memory.
+ * @param [in] size Size of the argument.
+ * @param [in] offset Offset of the argument on the argument stack.
+ *
+ */
+hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
+
+
+/**
+ * @brief Launch a kernel.
+ *
+ * @param [in] func Kernel to launch.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ */
+hipError_t hipLaunchByPtr(const void* func);
+
+
+/**
+ * @brief Push configuration of a kernel launch.
+ *
+ * @param [in] gridDim grid dimension specified as multiple of blockDim.
+ * @param [in] blockDim block dimensions specified in work-items
+ * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ */
+
+hipError_t __hipPushCallConfiguration(dim3 gridDim,
+                                      dim3 blockDim,
+                                      size_t sharedMem __dparm(0),
+                                      hipStream_t stream __dparm(0));
+
+/**
+ * @brief Pop configuration of a kernel launch.
+ *
+ * @param [out] gridDim grid dimension specified as multiple of blockDim.
+ * @param [out] blockDim block dimensions specified in work-items
+ * @param [out] sharedMem Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [out] stream Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ */
+hipError_t __hipPopCallConfiguration(dim3 *gridDim,
+                                     dim3 *blockDim,
+                                     size_t *sharedMem,
+                                     hipStream_t *stream);
+
+/**
+ * @brief C compliant kernel launch API
+ *
+ * @param [in] function_address - kernel stub function pointer.
+ * @param [in] numBlocks - number of blocks
+ * @param [in] dimBlocks - dimension of a block
+ * @param [in] args - kernel arguments
+ * @param [in] sharedMemBytes - Amount of dynamic shared memory to allocate for this kernel. The
+ * HIP-Clang compiler provides support for extern shared declarations.
+ * @param [in] stream - Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ *
+ */
+hipError_t hipLaunchKernel(const void* function_address,
+                           dim3 numBlocks,
+                           dim3 dimBlocks,
+                           void** args,
+                           size_t sharedMemBytes __dparm(0),
+                           hipStream_t stream __dparm(0));
+/**
+ * Copies memory for 2D arrays.
+ *
+ * @param pCopy - Parameters for the memory copy
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipDrvMemcpy2DUnaligned(const hip_Memcpy2D* pCopy);
+
+//TODO: Move this to hip_ext.h
+hipError_t hipExtLaunchKernel(const void* function_address, dim3 numBlocks, dim3 dimBlocks,
+                              void** args, size_t sharedMemBytes, hipStream_t stream,
+                              hipEvent_t startEvent, hipEvent_t stopEvent, int flags);
+// doxygen end Clang launch
+/**
+ * @}
+ */
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Texture Texture Management
+ *  @{
+ *  This section describes the texture management functions of HIP runtime API.
+ */
+
+/**
+ *
+ *  @addtogroup TextureD Texture Management [Deprecated]
+ *  @{
+ *  @ingroup Texture
+ *  This section describes the deprecated texture management functions of HIP runtime API.
+ */
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTexture(
+    size_t* offset,
+    const textureReference* tex,
+    const void* devPtr,
+    const hipChannelFormatDesc* desc,
+    size_t size __dparm(UINT_MAX));
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTexture2D(
+    size_t* offset,
+    const textureReference* tex,
+    const void* devPtr,
+    const hipChannelFormatDesc* desc,
+    size_t width,
+    size_t height,
+    size_t pitch);
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTextureToArray(
+    const textureReference* tex,
+    hipArray_const_t array,
+    const hipChannelFormatDesc* desc);
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipGetTextureAlignmentOffset(
+    size_t* offset,
+    const textureReference* texref);
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipUnbindTexture(const textureReference* tex);
+
+// doxygen end deprecated texture management
+/**
+ * @}
+ */
+
+hipError_t hipBindTextureToMipmappedArray(
+    const textureReference* tex,
+    hipMipmappedArray_const_t mipmappedArray,
+    const hipChannelFormatDesc* desc);
+
+hipError_t hipGetTextureReference(
+    const textureReference** texref,
+    const void* symbol);
+
+hipError_t hipCreateTextureObject(
+    hipTextureObject_t* pTexObject,
+    const hipResourceDesc* pResDesc,
+    const hipTextureDesc* pTexDesc,
+    const struct hipResourceViewDesc* pResViewDesc);
+
+hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject);
+
+hipError_t hipGetChannelDesc(
+    hipChannelFormatDesc* desc,
+    hipArray_const_t array);
+
+hipError_t hipGetTextureObjectResourceDesc(
+    hipResourceDesc* pResDesc,
+    hipTextureObject_t textureObject);
+
+hipError_t hipGetTextureObjectResourceViewDesc(
+    struct hipResourceViewDesc* pResViewDesc,
+    hipTextureObject_t textureObject);
+
+hipError_t hipGetTextureObjectTextureDesc(
+    hipTextureDesc* pTexDesc,
+    
hipTextureObject_t textureObject); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefGetAddress( + hipDeviceptr_t* dev_ptr, + const textureReference* texRef); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefGetAddressMode( + enum hipTextureAddressMode* pam, + const textureReference* texRef, + int dim); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefGetFilterMode( + enum hipTextureFilterMode* pfm, + const textureReference* texRef); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefGetFlags( + unsigned int* pFlags, + const textureReference* texRef); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefGetFormat( + hipArray_Format* pFormat, + int* pNumChannels, + const textureReference* texRef); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefGetMaxAnisotropy( + int* pmaxAnsio, + const textureReference* texRef); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefGetMipmapFilterMode( + enum hipTextureFilterMode* pfm, + const textureReference* texRef); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefGetMipmapLevelBias( + float* pbias, + const textureReference* texRef); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefGetMipmapLevelClamp( + float* pminMipmapLevelClamp, + float* pmaxMipmapLevelClamp, + const textureReference* texRef); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefGetMipMappedArray( + hipMipmappedArray_t* pArray, + const textureReference* texRef); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefSetAddress( + size_t* ByteOffset, + textureReference* texRef, + hipDeviceptr_t dptr, + size_t bytes); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefSetAddress2D( + textureReference* texRef, + const HIP_ARRAY_DESCRIPTOR* desc, + hipDeviceptr_t dptr, + size_t Pitch); + +hipError_t hipTexRefSetAddressMode( + textureReference* texRef, + int dim, + enum hipTextureAddressMode am); + +hipError_t hipTexRefSetArray( + textureReference* tex, + hipArray_const_t array, + unsigned int flags); + +hipError_t hipTexRefSetFilterMode( + textureReference* texRef, + enum hipTextureFilterMode fm); + +hipError_t hipTexRefSetFlags( + textureReference* texRef, + unsigned int Flags); + +hipError_t hipTexRefSetFormat( + textureReference* texRef, + hipArray_Format fmt, + int NumPackedComponents); + +DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefSetMaxAnisotropy( + textureReference* texRef, + unsigned int maxAniso); + +hipError_t hipTexObjectCreate( + hipTextureObject_t* pTexObject, + const HIP_RESOURCE_DESC* pResDesc, + const HIP_TEXTURE_DESC* pTexDesc, + const HIP_RESOURCE_VIEW_DESC* pResViewDesc); + +hipError_t hipTexObjectDestroy( + hipTextureObject_t texObject); + +hipError_t hipTexObjectGetResourceDesc( + HIP_RESOURCE_DESC* pResDesc, + hipTextureObject_t texObject); + +hipError_t hipTexObjectGetResourceViewDesc( + HIP_RESOURCE_VIEW_DESC* pResViewDesc, + hipTextureObject_t texObject); + +hipError_t hipTexObjectGetTextureDesc( + HIP_TEXTURE_DESC* pTexDesc, + hipTextureObject_t texObject); + +// doxygen end Texture management +/** + * @} + */ + +// The following are not supported. 
+DEPRECATED(DEPRECATED_MSG) +hipError_t hipTexRefSetBorderColor( + textureReference* texRef, + float* pBorderColor); + +hipError_t hipTexRefSetMipmapFilterMode( + textureReference* texRef, + enum hipTextureFilterMode fm); + +hipError_t hipTexRefSetMipmapLevelBias( + textureReference* texRef, + float bias); + +hipError_t hipTexRefSetMipmapLevelClamp( + textureReference* texRef, + float minMipMapLevelClamp, + float maxMipMapLevelClamp); + +hipError_t hipTexRefSetMipmappedArray( + textureReference* texRef, + struct hipMipmappedArray* mipmappedArray, + unsigned int Flags); + +hipError_t hipMipmappedArrayCreate( + hipMipmappedArray_t* pHandle, + HIP_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc, + unsigned int numMipmapLevels); + +hipError_t hipMipmappedArrayDestroy( + hipMipmappedArray_t hMipmappedArray); + +hipError_t hipMipmappedArrayGetLevel( + hipArray_t* pLevelArray, + hipMipmappedArray_t hMipMappedArray, + unsigned int level); + +/** + * Callback/Activity API + */ +hipError_t hipRegisterApiCallback(uint32_t id, void* fun, void* arg); +hipError_t hipRemoveApiCallback(uint32_t id); +hipError_t hipRegisterActivityCallback(uint32_t id, void* fun, void* arg); +hipError_t hipRemoveActivityCallback(uint32_t id); +const char* hipApiName(uint32_t id); +const char* hipKernelNameRef(const hipFunction_t f); +const char* hipKernelNameRefByPtr(const void* hostFunction, hipStream_t stream); +int hipGetStreamDeviceId(hipStream_t stream); + +#ifdef __cplusplus +/** + * An opaque value that represents a hip graph + */ +class hipGraph; +typedef hipGraph* hipGraph_t; + +/** + * An opaque value that represents a hip graph node + */ +class hipGraphNode; +typedef hipGraphNode* hipGraphNode_t; + +/** + * An opaque value that represents a hip graph Exec + */ +class hipGraphExec; +typedef hipGraphExec* hipGraphExec_t; +typedef enum hipGraphNodeType { + hipGraphNodeTypeKernel = 1, ///< GPU kernel node + hipGraphNodeTypeMemcpy = 2, ///< Memcpy 3D node + hipGraphNodeTypeMemset = 3, ///< Memset 1D node + hipGraphNodeTypeHost = 4, ///< Host (executable) node + hipGraphNodeTypeGraph = 5, ///< Node which executes an embedded graph + hipGraphNodeTypeEmpty = 6, ///< Empty (no-op) node + hipGraphNodeTypeWaitEvent = 7, ///< External event wait node + hipGraphNodeTypeEventRecord = 8, ///< External event record node + hipGraphNodeTypeMemcpy1D = 9, ///< Memcpy 1D node + hipGraphNodeTypeMemcpyFromSymbol = 10, ///< MemcpyFromSymbol node + hipGraphNodeTypeMemcpyToSymbol = 11, ///< MemcpyToSymbol node + hipGraphNodeTypeCount +} hipGraphNodeType; + +typedef void (*hipHostFn_t)(void* userData); +typedef struct hipHostNodeParams { + hipHostFn_t fn; + void* userData; +} hipHostNodeParams; + +typedef struct hipKernelNodeParams { + dim3 blockDim; + void** extra; + void* func; + dim3 gridDim; + void** kernelParams; + unsigned int sharedMemBytes; +} hipKernelNodeParams; + +typedef struct hipMemsetParams { + void* dst; + unsigned int elementSize; + size_t height; + size_t pitch; + unsigned int value; + size_t width; +} hipMemsetParams; + +enum hipGraphExecUpdateResult { + hipGraphExecUpdateSuccess = 0x0, ///< The update succeeded + hipGraphExecUpdateError = 0x1, ///< The update failed for an unexpected reason which is described + ///< in the return value of the function + hipGraphExecUpdateErrorTopologyChanged = 0x2, ///< The update failed because the topology changed + hipGraphExecUpdateErrorNodeTypeChanged = 0x3, ///< The update failed because a node type changed + hipGraphExecUpdateErrorFunctionChanged = + 0x4, ///< The update failed because 
the function of a kernel node changed + hipGraphExecUpdateErrorParametersChanged = + 0x5, ///< The update failed because the parameters changed in a way that is not supported + hipGraphExecUpdateErrorNotSupported = + 0x6, ///< The update failed because something about the node is not supported + hipGraphExecUpdateErrorUnsupportedFunctionChange = 0x7 +}; + +enum hipStreamCaptureMode { + hipStreamCaptureModeGlobal = 0, + hipStreamCaptureModeThreadLocal, + hipStreamCaptureModeRelaxed +}; + +enum hipStreamCaptureStatus { + hipStreamCaptureStatusNone = 0, ///< Stream is not capturing + hipStreamCaptureStatusActive, ///< Stream is actively capturing + hipStreamCaptureStatusInvalidated ///< Stream is part of a capture sequence that has been + ///< invalidated, but not terminated +}; + +hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode); + +hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph); + +// Creates a graph. +hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags); + +// Destroys a graph. +hipError_t hipGraphDestroy(hipGraph_t graph); + +// Destroys an executable graph. +hipError_t hipGraphExecDestroy(hipGraphExec_t pGraphExec); + +// Creates an executable graph from a graph. +hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph, + hipGraphNode_t* pErrorNode, char* pLogBuffer, size_t bufferSize); + +// Launches an executable graph in a stream. +hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream); + +// Creates a kernel execution node and adds it to a graph. +hipError_t hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, + const hipGraphNode_t* pDependencies, size_t numDependencies, + const hipKernelNodeParams* pNodeParams); + +// Creates a memcpy node and adds it to a graph. +hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, + const hipGraphNode_t* pDependencies, size_t numDependencies, + const hipMemcpy3DParms* pCopyParams); + +// Creates a memset node and adds it to a graph. 
+hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                 const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                 const hipMemsetParams* pMemsetParams);
+#endif
+// doxygen end graph API
+/**
+ * @}
+ */
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#if USE_PROF_API
+#include "hip/amd_detail/hip_prof_str.h"
+#endif
+
+#ifdef __cplusplus
+
+#if defined(__clang__) && defined(__HIP__)
+template <typename T>
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+    T f, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) {
+    return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, reinterpret_cast<const void*>(f),
+                                             dynSharedMemPerBlk, blockSizeLimit);
+}
+
+template <typename T>
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+    T f, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, unsigned int flags = 0) {
+    // Only the default flag is currently supported, so flags are not forwarded.
+    return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, reinterpret_cast<const void*>(f),
+                                             dynSharedMemPerBlk, blockSizeLimit);
+}
+#endif // defined(__clang__) && defined(__HIP__)
+
+template <typename T>
+hipError_t hipGetSymbolAddress(void** devPtr, const T &symbol) {
+    return ::hipGetSymbolAddress(devPtr, (const void *)&symbol);
+}
+
+template <typename T>
+hipError_t hipGetSymbolSize(size_t* size, const T &symbol) {
+    return ::hipGetSymbolSize(size, (const void *)&symbol);
+}
+
+template <typename T>
+hipError_t hipMemcpyToSymbol(const T& symbol, const void* src, size_t sizeBytes,
+                             size_t offset __dparm(0),
+                             hipMemcpyKind kind __dparm(hipMemcpyHostToDevice)) {
+    return ::hipMemcpyToSymbol((const void*)&symbol, src, sizeBytes, offset, kind);
+}
+
+template <typename T>
+hipError_t hipMemcpyToSymbolAsync(const T& symbol, const void* src, size_t sizeBytes, size_t offset,
+                                  hipMemcpyKind kind, hipStream_t stream __dparm(0)) {
+    return ::hipMemcpyToSymbolAsync((const void*)&symbol, src, sizeBytes, offset, kind, stream);
+}
+
+template <typename T>
+hipError_t hipMemcpyFromSymbol(void* dst, const T &symbol,
+                               size_t sizeBytes, size_t offset __dparm(0),
+                               hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {
+    return ::hipMemcpyFromSymbol(dst, (const void*)&symbol, sizeBytes, offset, kind);
+}
+
+template <typename T>
+hipError_t hipMemcpyFromSymbolAsync(void* dst, const T& symbol, size_t sizeBytes, size_t offset,
+                                    hipMemcpyKind kind, hipStream_t stream __dparm(0)) {
+    return ::hipMemcpyFromSymbolAsync(dst, (const void*)&symbol, sizeBytes, offset, kind, stream);
+}
+
+template <typename T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
+    int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) {
+    return hipOccupancyMaxActiveBlocksPerMultiprocessor(
+        numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk);
+}
+
+template <typename T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
+    return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+        numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk, flags);
+}
+
+template <typename F>
+inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+    F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) {
+    // The hipFunction_t cast matches the module-level occupancy query.
+    return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, (hipFunction_t)kernel,
+                                                   dynSharedMemPerBlk, blockSizeLimit);
+}
+
+template <typename T>
+inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
+    void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
+    return hipLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim,
+                                      blockDim, kernelParams, sharedMemBytes, stream);
+}
+
+template <class T>
+inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                        unsigned int numDevices, unsigned int flags = 0) {
+    return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
+}
+
+template <class T>
+inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                     unsigned int numDevices, unsigned int flags = 0) {
+    return hipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags);
+}
+
+hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipResourceDesc* pResDesc);
+
+hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject);
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
+                                        const void* devPtr, size_t size = UINT_MAX) {
+    return hipBindTexture(offset, &tex, devPtr, &tex.channelDesc, size);
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t
+    hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex, const void* devPtr,
+                   const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) {
+    return hipBindTexture(offset, &tex, devPtr, &desc, size);
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTexture2D(
+    size_t *offset,
+    const struct texture<T, dim, readMode> &tex,
+    const void *devPtr,
+    size_t width,
+    size_t height,
+    size_t pitch)
+{
+    return hipBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch);
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTexture2D(
+    size_t *offset,
+    const struct texture<T, dim, readMode> &tex,
+    const void *devPtr,
+    const struct hipChannelFormatDesc &desc,
+    size_t width,
+    size_t height,
+    size_t pitch)
+{
+    return hipBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch);
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTextureToArray(
+    const struct texture<T, dim, readMode> &tex,
+    hipArray_const_t array)
+{
+    struct hipChannelFormatDesc desc;
+    hipError_t err = hipGetChannelDesc(&desc, array);
+    return (err == hipSuccess) ? hipBindTextureToArray(&tex, array, &desc) : err;
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTextureToArray(
+    const struct texture<T, dim, readMode> &tex,
+    hipArray_const_t array,
+    const struct hipChannelFormatDesc &desc)
+{
+    return hipBindTextureToArray(&tex, array, &desc);
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+static inline hipError_t hipBindTextureToMipmappedArray(
+    const struct texture<T, dim, readMode> &tex,
+    hipMipmappedArray_const_t mipmappedArray)
+{
+    struct hipChannelFormatDesc desc;
+    hipArray_t levelArray;
+    hipError_t err = hipGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0);
+    if (err != hipSuccess) {
+        return err;
+    }
+    err = hipGetChannelDesc(&desc, levelArray);
+    return (err == hipSuccess) ?
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTextureToArray(
+    const struct texture<T, dim, readMode> &tex,
+    hipArray_const_t array)
+{
+    struct hipChannelFormatDesc desc;
+    hipError_t err = hipGetChannelDesc(&desc, array);
+    return (err == hipSuccess) ? hipBindTextureToArray(&tex, array, &desc) : err;
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTextureToArray(
+    const struct texture<T, dim, readMode> &tex,
+    hipArray_const_t array,
+    const struct hipChannelFormatDesc &desc)
+{
+    return hipBindTextureToArray(&tex, array, &desc);
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+static inline hipError_t hipBindTextureToMipmappedArray(
+    const struct texture<T, dim, readMode> &tex,
+    hipMipmappedArray_const_t mipmappedArray)
+{
+    struct hipChannelFormatDesc desc;
+    hipArray_t levelArray;
+    hipError_t err = hipGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0);
+    if (err != hipSuccess) {
+        return err;
+    }
+    err = hipGetChannelDesc(&desc, levelArray);
+    return (err == hipSuccess) ?
+        hipBindTextureToMipmappedArray(&tex, mipmappedArray, &desc) : err;
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+static inline hipError_t hipBindTextureToMipmappedArray(
+    const struct texture<T, dim, readMode> &tex,
+    hipMipmappedArray_const_t mipmappedArray,
+    const struct hipChannelFormatDesc &desc)
+{
+    return hipBindTextureToMipmappedArray(&tex, mipmappedArray, &desc);
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipUnbindTexture(
+    const struct texture<T, dim, readMode> &tex)
+{
+    return hipUnbindTexture(&tex);
+}
+
+// doxygen end Texture
+/**
+ * @}
+ */
+
+#endif // __cplusplus
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+// doxygen end HIP API
+/**
+ * @}
+ */
+
+#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_API_H
diff --git a/include/hip/amd_detail/hip_runtime_prof.h b/include/hip/amd_detail/hip_runtime_prof.h
new file mode 100644
index 0000000000..a45962d459
--- /dev/null
+++ b/include/hip/amd_detail/hip_runtime_prof.h
@@ -0,0 +1,77 @@
+/*
+Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H + +// HIP ROCclr Op IDs enumeration +enum HipVdiOpId { + kHipVdiOpIdDispatch = 0, + kHipVdiOpIdCopy = 1, + kHipVdiOpIdBarrier = 2, + kHipVdiOpIdNumber = 3 +}; + +// Types of ROCclr commands +enum HipVdiCommandKind { + kHipVdiCommandKernel = 0x11F0, + kHipVdiMemcpyDeviceToHost = 0x11F3, + kHipHipVdiMemcpyHostToDevice = 0x11F4, + kHipVdiMemcpyDeviceToDevice = 0x11F5, + kHipVidMemcpyDeviceToHostRect = 0x1201, + kHipVdiMemcpyHostToDeviceRect = 0x1202, + kHipVdiMemcpyDeviceToDeviceRect = 0x1203, + kHipVdiFillMemory = 0x1207, +}; + +/** + * @brief Initializes activity callback + * + * @param [input] id_callback Event ID callback function + * @param [input] op_callback Event operation callback function + * @param [input] arg Arguments passed into callback + * + * @returns None + */ +void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg); + +/** + * @brief Enables activity callback + * + * @param [input] op Operation, which will trigger a callback (@see HipVdiOpId) + * @param [input] enable Enable state for the callback + * + * @returns True if successful + */ +bool hipEnableActivityCallback(uint32_t op, bool enable); + +/** + * @brief Returns the description string for the operation kind + * + * @param [input] id Command kind id (@see HipVdiCommandKind) + * + * @returns A pointer to a const string with the command description + */ +const char* hipGetCmdName(uint32_t id); + +#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H + diff --git a/include/hip/amd_detail/hip_surface_types.h b/include/hip/amd_detail/hip_surface_types.h new file mode 100644 index 0000000000..2c53d19558 --- /dev/null +++ b/include/hip/amd_detail/hip_surface_types.h @@ -0,0 +1,54 @@ +/* +Copyright (c) 2015- present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file amd_detail/hip_surface_types.h + * @brief Defines surface types for HIP runtime. 
+ */
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_SURFACE_TYPES_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_SURFACE_TYPES_H
+
+#include
+
+/**
+ * An opaque value that represents a HIP surface object
+ */
+typedef unsigned long long hipSurfaceObject_t;
+
+/**
+ * HIP surface reference
+ */
+struct surfaceReference {
+    hipSurfaceObject_t surfaceObject;
+};
+
+/**
+ * HIP surface boundary modes
+ */
+enum hipSurfaceBoundaryMode {
+    hipBoundaryModeZero = 0,
+    hipBoundaryModeTrap = 1,
+    hipBoundaryModeClamp = 2
+};
+
+#endif /* !HIP_INCLUDE_HIP_AMD_DETAIL_HIP_SURFACE_TYPES_H */
diff --git a/include/hip/amd_detail/hip_texture_types.h b/include/hip/amd_detail/hip_texture_types.h
new file mode 100644
index 0000000000..0dc40ec0ed
--- /dev/null
+++ b/include/hip/amd_detail/hip_texture_types.h
@@ -0,0 +1,97 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file amd_detail/hip_texture_types.h
+ * @brief Defines the texture types for the HIP runtime.
+ */
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_TEXTURE_TYPES_H
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#if !defined(__HIPCC_RTC__)
+#include
+#include
+#endif // !defined(__HIPCC_RTC__)
+#include
+
+#if __cplusplus
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#if __HIP__
+#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
+#else
+#define __HIP_TEXTURE_ATTRIB
+#endif
+
+typedef textureReference* hipTexRef;
+
+template <class T, int texType = hipTextureType1D,
+          enum hipTextureReadMode mode = hipReadModeElementType>
+struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
+    texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
+            enum hipTextureAddressMode aMode = hipAddressModeClamp) {
+        normalized = norm;
+        readMode = mode;
+        filterMode = fMode;
+        addressMode[0] = aMode;
+        addressMode[1] = aMode;
+        addressMode[2] = aMode;
+        channelDesc = hipCreateChannelDesc<T>();
+        sRGB = 0;
+        textureObject = nullptr;
+        maxAnisotropy = 0;
+        mipmapLevelBias = 0;
+        minMipmapLevelClamp = 0;
+        maxMipmapLevelClamp = 0;
+    }
+
+    texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
+            struct hipChannelFormatDesc desc) {
+        normalized = norm;
+        readMode = mode;
+        filterMode = fMode;
+        addressMode[0] = aMode;
+        addressMode[1] = aMode;
+        addressMode[2] = aMode;
+        channelDesc = desc;
+        sRGB = 0;
+        textureObject = nullptr;
+        maxAnisotropy = 0;
+        mipmapLevelBias = 0;
+        minMipmapLevelClamp = 0;
+        maxMipmapLevelClamp = 0;
+    }
+};
+
+#endif /* __cplusplus */
+
+#endif /* !HIP_INCLUDE_HIP_AMD_DETAIL_HIP_TEXTURE_TYPES_H */
diff --git a/include/hip/amd_detail/hip_vector_types.h b/include/hip/amd_detail/hip_vector_types.h
new file mode 100644
index 0000000000..c102750531
--- /dev/null
+++ b/include/hip/amd_detail/hip_vector_types.h
@@ -0,0 +1,1598 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file amd_detail/hip_vector_types.h
+ * @brief Defines the different vector types for HIP runtime.
+ */
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_VECTOR_TYPES_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_VECTOR_TYPES_H
+
+#include "hip/amd_detail/host_defines.h"
+
+#if defined(__HIPCC_RTC__)
+  #define __HOST_DEVICE__ __device__
+#else
+  #define __HOST_DEVICE__ __host__ __device__
+#endif
+
+#if defined(__has_attribute)
+  #if __has_attribute(ext_vector_type)
+    #define __NATIVE_VECTOR__(n, T) T __attribute__((ext_vector_type(n)))
+  #else
+    #define __NATIVE_VECTOR__(n, T) T[n]
+  #endif
+
+#if defined(__cplusplus)
+// FIXME: Temporarily enable C++ headers while the mainline headers are outdated.
+//#if !defined(__HIPCC_RTC__)
+  #include
+  #include
+  #include
+//#endif // !defined(__HIPCC_RTC__)
+
+    namespace hip_impl {
+        template <typename, typename, unsigned int> struct Scalar_accessor;
+    } // Namespace hip_impl.
+
+    namespace std {
+        template <typename T, typename Vector, unsigned int idx>
+        struct is_integral<hip_impl::Scalar_accessor<T, Vector, idx>>
+            : is_integral<T> {};
+        template <typename T, typename Vector, unsigned int idx>
+        struct is_floating_point<hip_impl::Scalar_accessor<T, Vector, idx>>
+            : is_floating_point<T> {};
+    } // Namespace std.
+
+    namespace hip_impl {
+        template <typename T, typename Vector, unsigned int idx>
+        struct Scalar_accessor {
+            struct Address {
+                const Scalar_accessor* p;
+
+                __HOST_DEVICE__
+                operator const T*() const noexcept {
+                    return &reinterpret_cast<const T*>(p)[idx];
+                }
+                __HOST_DEVICE__
+                operator const T*() const volatile noexcept {
+                    return &reinterpret_cast<const T*>(p)[idx];
+                }
+                __HOST_DEVICE__
+                operator T*() noexcept {
+                    return &reinterpret_cast<T*>(
+                        const_cast<Scalar_accessor*>(p))[idx];
+                }
+                __HOST_DEVICE__
+                operator T*() volatile noexcept {
+                    return &reinterpret_cast<T*>(
+                        const_cast<Scalar_accessor*>(p))[idx];
+                }
+            };
+
+            friend
+            inline
+            std::ostream& operator<<(std::ostream& os,
+                                     const Scalar_accessor& x) noexcept {
+                return os << x.data[idx];
+            }
+            friend
+            inline
+            std::istream& operator>>(std::istream& is,
+                                     Scalar_accessor& x) noexcept {
+                T tmp;
+                is >> tmp;
+                x.data[idx] = tmp;
+
+                return is;
+            }
+
+            // Idea from https://t0rakka.silvrback.com/simd-scalar-accessor
+            Vector data;
+
+            __HOST_DEVICE__
+            operator T() const noexcept { return data[idx]; }
+            __HOST_DEVICE__
+            operator T() const volatile noexcept { return data[idx]; }
+
+#ifdef __HIP_ENABLE_VECTOR_SCALAR_ACCESSORY_ENUM_CONVERSION__
+            // The conversions to enum are fairly ghastly, but unfortunately used in
+            // some pre-existing, difficult to modify, code.
+            template<
+                typename U,
+                typename std::enable_if<
+                    !std::is_same<T, U>{} &&
+                    std::is_enum<U>{} &&
+                    std::is_convertible<
+                        T, typename std::enable_if<std::is_enum<U>::value,
+                                                   std::underlying_type<U>>::type::type>{}>::type* = nullptr>
+            __HOST_DEVICE__
+            operator U() const noexcept { return static_cast<U>(data[idx]); }
+            template<
+                typename U,
+                typename std::enable_if<
+                    !std::is_same<T, U>{} &&
+                    std::is_enum<U>{} &&
+                    std::is_convertible<
+                        T, typename std::enable_if<std::is_enum<U>::value,
+                                                   std::underlying_type<U>>::type::type>{}>::type* = nullptr>
+            __HOST_DEVICE__
+            operator U() const volatile noexcept { return static_cast<U>(data[idx]); }
+#endif
+
+            __HOST_DEVICE__
+            operator T&() noexcept {
+                return reinterpret_cast<
+                    T (&)[sizeof(Vector) / sizeof(T)]>(data)[idx];
+            }
+            __HOST_DEVICE__
+            operator volatile T&() volatile noexcept {
+                return reinterpret_cast<
+                    volatile T (&)[sizeof(Vector) / sizeof(T)]>(data)[idx];
+            }
+
+            __HOST_DEVICE__
+            Address operator&() const noexcept { return Address{this}; }
+
+            __HOST_DEVICE__
+            Scalar_accessor& operator=(const Scalar_accessor& x) noexcept {
+                data[idx] = x.data[idx];
+
+                return *this;
+            }
+            __HOST_DEVICE__
+            Scalar_accessor& operator=(T x) noexcept {
+                data[idx] = x;
+
+                return *this;
+            }
+            __HOST_DEVICE__
+            volatile Scalar_accessor& operator=(T x) volatile noexcept {
+                data[idx] = x;
+
+                return *this;
+            }
+
+            __HOST_DEVICE__
+            Scalar_accessor& operator++() noexcept {
+                ++data[idx];
+                return *this;
+            }
+            __HOST_DEVICE__
+            T operator++(int) noexcept {
+                auto r{data[idx]};
+                ++data[idx];
+                return r;  // Post-increment returns the prior value, not *this.
+            }
+            __HOST_DEVICE__
+            Scalar_accessor& operator--() noexcept {
+                --data[idx];
+                return *this;
+            }
+            __HOST_DEVICE__
+            T operator--(int) noexcept {
+                auto r{data[idx]};
+                --data[idx];
+                return r;  // Post-decrement returns the prior value, not *this.
+            }
+
+            // TODO: convertibility is too restrictive, constraint should be on
+            // the operator being invocable with a value of type U.
+ template< + typename U, + typename std::enable_if< + std::is_convertible{}>::type* = nullptr> + __HOST_DEVICE__ + Scalar_accessor& operator+=(U x) noexcept { + data[idx] += x; + return *this; + } + template< + typename U, + typename std::enable_if< + std::is_convertible{}>::type* = nullptr> + __HOST_DEVICE__ + Scalar_accessor& operator-=(U x) noexcept { + data[idx] -= x; + return *this; + } + + template< + typename U, + typename std::enable_if< + std::is_convertible{}>::type* = nullptr> + __HOST_DEVICE__ + Scalar_accessor& operator*=(U x) noexcept { + data[idx] *= x; + return *this; + } + template< + typename U, + typename std::enable_if< + std::is_convertible{}>::type* = nullptr> + __HOST_DEVICE__ + Scalar_accessor& operator/=(U x) noexcept { + data[idx] /= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{} && + std::is_integral{}>::type* = nullptr> + __HOST_DEVICE__ + Scalar_accessor& operator%=(U x) noexcept { + data[idx] %= x; + return *this; + } + + template< + typename U = T, + typename std::enable_if{} && + std::is_integral{}>::type* = nullptr> + __HOST_DEVICE__ + Scalar_accessor& operator>>=(U x) noexcept { + data[idx] >>= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{} && + std::is_integral{}>::type* = nullptr> + __HOST_DEVICE__ + Scalar_accessor& operator<<=(U x) noexcept { + data[idx] <<= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{} && + std::is_integral{}>::type* = nullptr> + __HOST_DEVICE__ + Scalar_accessor& operator&=(U x) noexcept { + data[idx] &= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{} && + std::is_integral{}>::type* = nullptr> + __HOST_DEVICE__ + Scalar_accessor& operator|=(U x) noexcept { + data[idx] |= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{} && + std::is_integral{}>::type* = nullptr> + __HOST_DEVICE__ + Scalar_accessor& operator^=(U x) noexcept { + data[idx] ^= x; + return *this; + } + }; + + inline + constexpr + unsigned int next_pot(unsigned int x) { + // Precondition: x > 1. + return 1u << (32u - __builtin_clz(x - 1u)); + } + } // Namespace hip_impl. 
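+
+    // Illustrative note (not part of the original header): on compilers without
+    // ext_vector_type support, Scalar_accessor is what makes member-style access
+    // such as
+    //
+    //   float4 v{0.f, 1.f, 2.f, 3.f};   // float4 is HIP_vector_type<float, 4>
+    //   v.x = v.y + v.w;                // .x/.y/.w are Scalar_accessor members
+    //
+    // behave like plain struct fields, while next_pot() rounds the union's
+    // alignment up to the next power of two (e.g. next_pot(12) == 16).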
+ + template struct HIP_vector_base; + + template + struct HIP_vector_base { + using Native_vec_ = __NATIVE_VECTOR__(1, T); + + union { + Native_vec_ data; +#if __HIP_CLANG_ONLY__ + struct { + T x; + }; +#else + hip_impl::Scalar_accessor x; +#endif + }; + + using value_type = T; + + __HOST_DEVICE__ + HIP_vector_base() = default; + __HOST_DEVICE__ + explicit + constexpr + HIP_vector_base(T x_) noexcept : data{x_} {} + __HOST_DEVICE__ + constexpr + HIP_vector_base(const HIP_vector_base&) = default; + __HOST_DEVICE__ + constexpr + HIP_vector_base(HIP_vector_base&&) = default; + __HOST_DEVICE__ + ~HIP_vector_base() = default; + + __HOST_DEVICE__ + HIP_vector_base& operator=(const HIP_vector_base& x_) noexcept { + #if __has_attribute(ext_vector_type) + data = x_.data; + #else + data[0] = x_.data[0]; + #endif + + return *this; + } + }; + + template + struct HIP_vector_base { + using Native_vec_ = __NATIVE_VECTOR__(2, T); + + union + #if !__has_attribute(ext_vector_type) + alignas(hip_impl::next_pot(2 * sizeof(T))) + #endif + { + Native_vec_ data; +#if __HIP_CLANG_ONLY__ + struct { + T x; + T y; + }; +#else + hip_impl::Scalar_accessor x; + hip_impl::Scalar_accessor y; +#endif + }; + + using value_type = T; + + __HOST_DEVICE__ + HIP_vector_base() = default; + __HOST_DEVICE__ + explicit + constexpr + HIP_vector_base(T x_) noexcept : data{x_, x_} {} + __HOST_DEVICE__ + constexpr + HIP_vector_base(T x_, T y_) noexcept : data{x_, y_} {} + __HOST_DEVICE__ + constexpr + HIP_vector_base(const HIP_vector_base&) = default; + __HOST_DEVICE__ + constexpr + HIP_vector_base(HIP_vector_base&&) = default; + __HOST_DEVICE__ + ~HIP_vector_base() = default; + + __HOST_DEVICE__ + HIP_vector_base& operator=(const HIP_vector_base& x_) noexcept { + #if __has_attribute(ext_vector_type) + data = x_.data; + #else + data[0] = x_.data[0]; + data[1] = x_.data[1]; + #endif + + return *this; + } + }; + + template + struct HIP_vector_base { + struct Native_vec_ { + T d[3]; + + __HOST_DEVICE__ + Native_vec_() = default; + + __HOST_DEVICE__ + explicit + constexpr + Native_vec_(T x_) noexcept : d{x_, x_, x_} {} + __HOST_DEVICE__ + constexpr + Native_vec_(T x_, T y_, T z_) noexcept : d{x_, y_, z_} {} + __HOST_DEVICE__ + constexpr + Native_vec_(const Native_vec_&) = default; + __HOST_DEVICE__ + constexpr + Native_vec_(Native_vec_&&) = default; + __HOST_DEVICE__ + ~Native_vec_() = default; + + __HOST_DEVICE__ + Native_vec_& operator=(const Native_vec_&) = default; + __HOST_DEVICE__ + Native_vec_& operator=(Native_vec_&&) = default; + + __HOST_DEVICE__ + T& operator[](unsigned int idx) noexcept { return d[idx]; } + __HOST_DEVICE__ + T operator[](unsigned int idx) const noexcept { return d[idx]; } + + __HOST_DEVICE__ + Native_vec_& operator+=(const Native_vec_& x_) noexcept + { + for (auto i = 0u; i != 3u; ++i) d[i] += x_.d[i]; + return *this; + } + __HOST_DEVICE__ + Native_vec_& operator-=(const Native_vec_& x_) noexcept + { + for (auto i = 0u; i != 3u; ++i) d[i] -= x_.d[i]; + return *this; + } + + __HOST_DEVICE__ + Native_vec_& operator*=(const Native_vec_& x_) noexcept + { + for (auto i = 0u; i != 3u; ++i) d[i] *= x_.d[i]; + return *this; + } + __HOST_DEVICE__ + Native_vec_& operator/=(const Native_vec_& x_) noexcept + { + for (auto i = 0u; i != 3u; ++i) d[i] /= x_.d[i]; + return *this; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + Native_vec_ operator-() const noexcept + { + auto r{*this}; + for (auto&& x : r.d) x = -x; + return r; + } + + template< + typename U = T, + 
typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + Native_vec_ operator~() const noexcept + { + auto r{*this}; + for (auto&& x : r.d) x = ~x; + return r; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + Native_vec_& operator%=(const Native_vec_& x_) noexcept + { + for (auto i = 0u; i != 3u; ++i) d[i] %= x_.d[i]; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + Native_vec_& operator^=(const Native_vec_& x_) noexcept + { + for (auto i = 0u; i != 3u; ++i) d[i] ^= x_.d[i]; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + Native_vec_& operator|=(const Native_vec_& x_) noexcept + { + for (auto i = 0u; i != 3u; ++i) d[i] |= x_.d[i]; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + Native_vec_& operator&=(const Native_vec_& x_) noexcept + { + for (auto i = 0u; i != 3u; ++i) d[i] &= x_.d[i]; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + Native_vec_& operator>>=(const Native_vec_& x_) noexcept + { + for (auto i = 0u; i != 3u; ++i) d[i] >>= x_.d[i]; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + Native_vec_& operator<<=(const Native_vec_& x_) noexcept + { + for (auto i = 0u; i != 3u; ++i) d[i] <<= x_.d[i]; + return *this; + } + + using Vec3_cmp = int __attribute__((vector_size(4 * sizeof(int)))); + __HOST_DEVICE__ + Vec3_cmp operator==(const Native_vec_& x_) const noexcept + { + return Vec3_cmp{d[0] == x_.d[0], d[1] == x_.d[1], d[2] == x_.d[2]}; + } + }; + + union { + Native_vec_ data; + struct { + T x; + T y; + T z; + }; + }; + + using value_type = T; + + __HOST_DEVICE__ + HIP_vector_base() = default; + __HOST_DEVICE__ + explicit + constexpr + HIP_vector_base(T x_) noexcept : data{x_, x_, x_} {} + __HOST_DEVICE__ + constexpr + HIP_vector_base(T x_, T y_, T z_) noexcept : data{x_, y_, z_} {} + __HOST_DEVICE__ + constexpr + HIP_vector_base(const HIP_vector_base&) = default; + __HOST_DEVICE__ + constexpr + HIP_vector_base(HIP_vector_base&&) = default; + __HOST_DEVICE__ + ~HIP_vector_base() = default; + + __HOST_DEVICE__ + HIP_vector_base& operator=(const HIP_vector_base&) = default; + __HOST_DEVICE__ + HIP_vector_base& operator=(HIP_vector_base&&) = default; + }; + + template + struct HIP_vector_base { + using Native_vec_ = __NATIVE_VECTOR__(4, T); + + union + #if !__has_attribute(ext_vector_type) + alignas(hip_impl::next_pot(4 * sizeof(T))) + #endif + { + Native_vec_ data; +#if __HIP_CLANG_ONLY__ + struct { + T x; + T y; + T z; + T w; + }; +#else + hip_impl::Scalar_accessor x; + hip_impl::Scalar_accessor y; + hip_impl::Scalar_accessor z; + hip_impl::Scalar_accessor w; +#endif + }; + + using value_type = T; + + __HOST_DEVICE__ + HIP_vector_base() = default; + __HOST_DEVICE__ + explicit + constexpr + HIP_vector_base(T x_) noexcept : data{x_, x_, x_, x_} {} + __HOST_DEVICE__ + constexpr + HIP_vector_base(T x_, T y_, T z_, T w_) noexcept : data{x_, y_, z_, w_} {} + __HOST_DEVICE__ + constexpr + HIP_vector_base(const HIP_vector_base&) = default; + __HOST_DEVICE__ + constexpr + HIP_vector_base(HIP_vector_base&&) = default; + __HOST_DEVICE__ + ~HIP_vector_base() = default; + + __HOST_DEVICE__ + HIP_vector_base& operator=(const HIP_vector_base& x_) noexcept { + #if __has_attribute(ext_vector_type) + data = 
x_.data; + #else + data[0] = x_.data[0]; + data[1] = x_.data[1]; + data[2] = x_.data[2]; + data[3] = x_.data[3]; + #endif + + return *this; + } + }; + + template + struct HIP_vector_type : public HIP_vector_base { + using HIP_vector_base::data; + using typename HIP_vector_base::Native_vec_; + + __HOST_DEVICE__ + HIP_vector_type() = default; + template< + typename U, + typename std::enable_if< + std::is_convertible{}>::type* = nullptr> + __HOST_DEVICE__ + explicit + constexpr + HIP_vector_type(U x_) noexcept + : HIP_vector_base{static_cast(x_)} + {} + template< // TODO: constrain based on type as well. + typename... Us, + typename std::enable_if< + (rank > 1) && sizeof...(Us) == rank>::type* = nullptr> + __HOST_DEVICE__ + constexpr + HIP_vector_type(Us... xs) noexcept + : HIP_vector_base{static_cast(xs)...} + {} + __HOST_DEVICE__ + constexpr + HIP_vector_type(const HIP_vector_type&) = default; + __HOST_DEVICE__ + constexpr + HIP_vector_type(HIP_vector_type&&) = default; + __HOST_DEVICE__ + ~HIP_vector_type() = default; + + __HOST_DEVICE__ + HIP_vector_type& operator=(const HIP_vector_type&) = default; + __HOST_DEVICE__ + HIP_vector_type& operator=(HIP_vector_type&&) = default; + + // Operators + __HOST_DEVICE__ + HIP_vector_type& operator++() noexcept + { + return *this += HIP_vector_type{1}; + } + __HOST_DEVICE__ + HIP_vector_type operator++(int) noexcept + { + auto tmp(*this); + ++*this; + return tmp; + } + + __HOST_DEVICE__ + HIP_vector_type& operator--() noexcept + { + return *this -= HIP_vector_type{1}; + } + __HOST_DEVICE__ + HIP_vector_type operator--(int) noexcept + { + auto tmp(*this); + --*this; + return tmp; + } + + __HOST_DEVICE__ + HIP_vector_type& operator+=(const HIP_vector_type& x) noexcept + { + data += x.data; + return *this; + } + template< + typename U, + typename std::enable_if< + std::is_convertible{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type& operator+=(U x) noexcept + { + return *this += HIP_vector_type{x}; + } + + __HOST_DEVICE__ + HIP_vector_type& operator-=(const HIP_vector_type& x) noexcept + { + data -= x.data; + return *this; + } + template< + typename U, + typename std::enable_if< + std::is_convertible{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type& operator-=(U x) noexcept + { + return *this -= HIP_vector_type{x}; + } + + __HOST_DEVICE__ + HIP_vector_type& operator*=(const HIP_vector_type& x) noexcept + { + data *= x.data; + return *this; + } + template< + typename U, + typename std::enable_if< + std::is_convertible{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type& operator*=(U x) noexcept + { + return *this *= HIP_vector_type{x}; + } + + __HOST_DEVICE__ + HIP_vector_type& operator/=(const HIP_vector_type& x) noexcept + { + data /= x.data; + return *this; + } + template< + typename U, + typename std::enable_if< + std::is_convertible{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type& operator/=(U x) noexcept + { + return *this /= HIP_vector_type{x}; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type operator-() const noexcept + { + auto tmp(*this); + tmp.data = -tmp.data; + return tmp; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type operator~() const noexcept + { + HIP_vector_type r{*this}; + r.data = ~r.data; + return r; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type& operator%=(const HIP_vector_type& x) noexcept + 
{ + data %= x.data; + return *this; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type& operator^=(const HIP_vector_type& x) noexcept + { + data ^= x.data; + return *this; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type& operator|=(const HIP_vector_type& x) noexcept + { + data |= x.data; + return *this; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type& operator&=(const HIP_vector_type& x) noexcept + { + data &= x.data; + return *this; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type& operator>>=(const HIP_vector_type& x) noexcept + { + data >>= x.data; + return *this; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __HOST_DEVICE__ + HIP_vector_type& operator<<=(const HIP_vector_type& x) noexcept + { + data <<= x.data; + return *this; + } + }; + + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator+( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} += y; + } + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator+( + const HIP_vector_type& x, U y) noexcept + { + return HIP_vector_type{x} += HIP_vector_type{y}; + } + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator+( + U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} += y; + } + + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator-( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} -= y; + } + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator-( + const HIP_vector_type& x, U y) noexcept + { + return HIP_vector_type{x} -= HIP_vector_type{y}; + } + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator-( + U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} -= y; + } + + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator*( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} *= y; + } + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator*( + const HIP_vector_type& x, U y) noexcept + { + return HIP_vector_type{x} *= HIP_vector_type{y}; + } + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator*( + U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} *= y; + } + + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator/( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} /= y; + } + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator/( + const HIP_vector_type& x, U y) noexcept + { + return HIP_vector_type{x} /= HIP_vector_type{y}; + } + template + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator/( + U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} /= y; + } + + template + __HOST_DEVICE__ + inline + constexpr + bool _hip_any_zero(const V& x, int n) noexcept + { + return + (n == -1) ? true : ((x[n] == 0) ? 
false : _hip_any_zero(x, n - 1)); + } + + template + __HOST_DEVICE__ + inline + constexpr + bool operator==( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return _hip_any_zero(x.data == y.data, n - 1); + } + template + __HOST_DEVICE__ + inline + constexpr + bool operator==(const HIP_vector_type& x, U y) noexcept + { + return x == HIP_vector_type{y}; + } + template + __HOST_DEVICE__ + inline + constexpr + bool operator==(U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} == y; + } + + template + __HOST_DEVICE__ + inline + constexpr + bool operator!=( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return !(x == y); + } + template + __HOST_DEVICE__ + inline + constexpr + bool operator!=(const HIP_vector_type& x, U y) noexcept + { + return !(x == y); + } + template + __HOST_DEVICE__ + inline + constexpr + bool operator!=(U x, const HIP_vector_type& y) noexcept + { + return !(x == y); + } + + template< + typename T, + unsigned int n, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator%( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} %= y; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator%( + const HIP_vector_type& x, U y) noexcept + { + return HIP_vector_type{x} %= HIP_vector_type{y}; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator%( + U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} %= y; + } + + template< + typename T, + unsigned int n, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator^( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} ^= y; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator^( + const HIP_vector_type& x, U y) noexcept + { + return HIP_vector_type{x} ^= HIP_vector_type{y}; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator^( + U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} ^= y; + } + + template< + typename T, + unsigned int n, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator|( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} |= y; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator|( + const HIP_vector_type& x, U y) noexcept + { + return HIP_vector_type{x} |= HIP_vector_type{y}; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator|( + U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} |= y; + } + + template< + typename T, + unsigned int n, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator&( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} &= 
y; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator&( + const HIP_vector_type& x, U y) noexcept + { + return HIP_vector_type{x} &= HIP_vector_type{y}; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator&( + U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} &= y; + } + + template< + typename T, + unsigned int n, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator>>( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} >>= y; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator>>( + const HIP_vector_type& x, U y) noexcept + { + return HIP_vector_type{x} >>= HIP_vector_type{y}; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator>>( + U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} >>= y; + } + + template< + typename T, + unsigned int n, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator<<( + const HIP_vector_type& x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} <<= y; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator<<( + const HIP_vector_type& x, U y) noexcept + { + return HIP_vector_type{x} <<= HIP_vector_type{y}; + } + template< + typename T, + unsigned int n, + typename U, + typename std::enable_if::value>::type, + typename std::enable_if{}>* = nullptr> + __HOST_DEVICE__ + inline + constexpr + HIP_vector_type operator<<( + U x, const HIP_vector_type& y) noexcept + { + return HIP_vector_type{x} <<= y; + } + + #define __MAKE_VECTOR_TYPE__(CUDA_name, T) \ + using CUDA_name##1 = HIP_vector_type;\ + using CUDA_name##2 = HIP_vector_type;\ + using CUDA_name##3 = HIP_vector_type;\ + using CUDA_name##4 = HIP_vector_type; +#else + #define __MAKE_VECTOR_TYPE__(CUDA_name, T) \ + typedef struct {\ + T x;\ + } CUDA_name##1;\ + typedef struct {\ + T x;\ + T y;\ + } CUDA_name##2;\ + typedef struct {\ + T x;\ + T y;\ + T z;\ + } CUDA_name##3;\ + typedef struct {\ + T x;\ + T y;\ + T z;\ + T w;\ + } CUDA_name##4; +#endif + +__MAKE_VECTOR_TYPE__(uchar, unsigned char); +__MAKE_VECTOR_TYPE__(char, char); +__MAKE_VECTOR_TYPE__(ushort, unsigned short); +__MAKE_VECTOR_TYPE__(short, short); +__MAKE_VECTOR_TYPE__(uint, unsigned int); +__MAKE_VECTOR_TYPE__(int, int); +__MAKE_VECTOR_TYPE__(ulong, unsigned long); +__MAKE_VECTOR_TYPE__(long, long); +__MAKE_VECTOR_TYPE__(ulonglong, unsigned long long); +__MAKE_VECTOR_TYPE__(longlong, long long); +__MAKE_VECTOR_TYPE__(float, float); +__MAKE_VECTOR_TYPE__(double, double); + +#ifdef __cplusplus +#define DECLOP_MAKE_ONE_COMPONENT(comp, type) \ + static inline __HOST_DEVICE__ \ + type make_##type(comp x) { type r{x}; return r; } + +#define DECLOP_MAKE_TWO_COMPONENT(comp, type) \ + static inline __HOST_DEVICE__ \ + type make_##type(comp x, comp y) { type r{x, y}; return r; } + +#define DECLOP_MAKE_THREE_COMPONENT(comp, type) \ + static inline __HOST_DEVICE__ \ + type 
make_##type(comp x, comp y, comp z) { type r{x, y, z}; return r; } + +#define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \ + static inline __HOST_DEVICE__ \ + type make_##type(comp x, comp y, comp z, comp w) { \ + type r{x, y, z, w}; \ + return r; \ + } +#else + #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \ + static inline __HOST_DEVICE__ \ + type make_##type(comp x) { type r; r.x =x; return r; } + + #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \ + static inline __HOST_DEVICE__ \ + type make_##type(comp x, comp y) { type r; r.x=x; r.y=y; return r; } + + #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \ + static inline __HOST_DEVICE__ \ + type make_##type(comp x, comp y, comp z) { type r; r.x=x; r.y=y; r.z=z; return r; } + + #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \ + static inline __HOST_DEVICE__ \ + type make_##type(comp x, comp y, comp z, comp w) { \ + type r; r.x=x; r.y=y; r.z=z; r.w=w; \ + return r; \ + } +#endif + +DECLOP_MAKE_ONE_COMPONENT(unsigned char, uchar1); +DECLOP_MAKE_TWO_COMPONENT(unsigned char, uchar2); +DECLOP_MAKE_THREE_COMPONENT(unsigned char, uchar3); +DECLOP_MAKE_FOUR_COMPONENT(unsigned char, uchar4); + +DECLOP_MAKE_ONE_COMPONENT(signed char, char1); +DECLOP_MAKE_TWO_COMPONENT(signed char, char2); +DECLOP_MAKE_THREE_COMPONENT(signed char, char3); +DECLOP_MAKE_FOUR_COMPONENT(signed char, char4); + +DECLOP_MAKE_ONE_COMPONENT(unsigned short, ushort1); +DECLOP_MAKE_TWO_COMPONENT(unsigned short, ushort2); +DECLOP_MAKE_THREE_COMPONENT(unsigned short, ushort3); +DECLOP_MAKE_FOUR_COMPONENT(unsigned short, ushort4); + +DECLOP_MAKE_ONE_COMPONENT(signed short, short1); +DECLOP_MAKE_TWO_COMPONENT(signed short, short2); +DECLOP_MAKE_THREE_COMPONENT(signed short, short3); +DECLOP_MAKE_FOUR_COMPONENT(signed short, short4); + +DECLOP_MAKE_ONE_COMPONENT(unsigned int, uint1); +DECLOP_MAKE_TWO_COMPONENT(unsigned int, uint2); +DECLOP_MAKE_THREE_COMPONENT(unsigned int, uint3); +DECLOP_MAKE_FOUR_COMPONENT(unsigned int, uint4); + +DECLOP_MAKE_ONE_COMPONENT(signed int, int1); +DECLOP_MAKE_TWO_COMPONENT(signed int, int2); +DECLOP_MAKE_THREE_COMPONENT(signed int, int3); +DECLOP_MAKE_FOUR_COMPONENT(signed int, int4); + +DECLOP_MAKE_ONE_COMPONENT(float, float1); +DECLOP_MAKE_TWO_COMPONENT(float, float2); +DECLOP_MAKE_THREE_COMPONENT(float, float3); +DECLOP_MAKE_FOUR_COMPONENT(float, float4); + +DECLOP_MAKE_ONE_COMPONENT(double, double1); +DECLOP_MAKE_TWO_COMPONENT(double, double2); +DECLOP_MAKE_THREE_COMPONENT(double, double3); +DECLOP_MAKE_FOUR_COMPONENT(double, double4); + +DECLOP_MAKE_ONE_COMPONENT(unsigned long, ulong1); +DECLOP_MAKE_TWO_COMPONENT(unsigned long, ulong2); +DECLOP_MAKE_THREE_COMPONENT(unsigned long, ulong3); +DECLOP_MAKE_FOUR_COMPONENT(unsigned long, ulong4); + +DECLOP_MAKE_ONE_COMPONENT(signed long, long1); +DECLOP_MAKE_TWO_COMPONENT(signed long, long2); +DECLOP_MAKE_THREE_COMPONENT(signed long, long3); +DECLOP_MAKE_FOUR_COMPONENT(signed long, long4); + +DECLOP_MAKE_ONE_COMPONENT(unsigned long long, ulonglong1); +DECLOP_MAKE_TWO_COMPONENT(unsigned long long, ulonglong2); +DECLOP_MAKE_THREE_COMPONENT(unsigned long long, ulonglong3); +DECLOP_MAKE_FOUR_COMPONENT(unsigned long long, ulonglong4); + +DECLOP_MAKE_ONE_COMPONENT(signed long long, longlong1); +DECLOP_MAKE_TWO_COMPONENT(signed long long, longlong2); +DECLOP_MAKE_THREE_COMPONENT(signed long long, longlong3); +DECLOP_MAKE_FOUR_COMPONENT(signed long long, longlong4); +#else // !defined(__has_attribute) + +#if defined(_MSC_VER) +#include +#include +#include +#include + +typedef union { char data; } char1; +typedef 
union { char data[2]; } char2; +typedef union { char data[4]; } char4; +typedef union { char4 data; } char3; +typedef union { __m64 data; } char8; +typedef union { __m128i data; } char16; + +typedef union { unsigned char data; } uchar1; +typedef union { unsigned char data[2]; } uchar2; +typedef union { unsigned char data[4]; } uchar4; +typedef union { uchar4 data; } uchar3; +typedef union { __m64 data; } uchar8; +typedef union { __m128i data; } uchar16; + +typedef union { short data; } short1; +typedef union { short data[2]; } short2; +typedef union { __m64 data; } short4; +typedef union { short4 data; } short3; +typedef union { __m128i data; } short8; +typedef union { __m128i data[2]; } short16; + +typedef union { unsigned short data; } ushort1; +typedef union { unsigned short data[2]; } ushort2; +typedef union { __m64 data; } ushort4; +typedef union { ushort4 data; } ushort3; +typedef union { __m128i data; } ushort8; +typedef union { __m128i data[2]; } ushort16; + +typedef union { int data; } int1; +typedef union { __m64 data; } int2; +typedef union { __m128i data; } int4; +typedef union { int4 data; } int3; +typedef union { __m128i data[2]; } int8; +typedef union { __m128i data[4];} int16; + +typedef union { unsigned int data; } uint1; +typedef union { __m64 data; } uint2; +typedef union { __m128i data; } uint4; +typedef union { uint4 data; } uint3; +typedef union { __m128i data[2]; } uint8; +typedef union { __m128i data[4]; } uint16; + +#if !defined(_WIN64) +typedef union { int data; } long1; +typedef union { __m64 data; } long2; +typedef union { __m128i data; } long4; +typedef union { long4 data; } long3; +typedef union { __m128i data[2]; } long8; +typedef union { __m128i data[4]; } long16; + +typedef union { unsigned int data; } ulong1; +typedef union { __m64 data; } ulong2; +typedef union { __m128i data; } ulong4; +typedef union { ulong4 data; } ulong3; +typedef union { __m128i data[2]; } ulong8; +typedef union { __m128i data[4]; } ulong16; +#else // defined(_WIN64) +typedef union { __m64 data; } long1; +typedef union { __m128i data; } long2; +typedef union { __m128i data[2]; } long4; +typedef union { long4 data; } long3; +typedef union { __m128i data[4]; } long8; +typedef union { __m128i data[8]; } long16; + +typedef union { __m64 data; } ulong1; +typedef union { __m128i data; } ulong2; +typedef union { __m128i data[2]; } ulong4; +typedef union { ulong4 data; } ulong3; +typedef union { __m128i data[4]; } ulong8; +typedef union { __m128i data[8]; } ulong16; +#endif // defined(_WIN64) + +typedef union { __m64 data; } longlong1; +typedef union { __m128i data; } longlong2; +typedef union { __m128i data[2]; } longlong4; +typedef union { longlong4 data; } longlong3; +typedef union { __m128i data[4]; } longlong8; +typedef union { __m128i data[8]; } longlong16; + +typedef union { __m64 data; } ulonglong1; +typedef union { __m128i data; } ulonglong2; +typedef union { __m128i data[2]; } ulonglong4; +typedef union { ulonglong4 data; } ulonglong3; +typedef union { __m128i data[4]; } ulonglong8; +typedef union { __m128i data[8]; } ulonglong16; + +typedef union { float data; } float1; +typedef union { __m64 data; } float2; +typedef union { __m128 data; } float4; +typedef union { float4 data; } float3; +typedef union { __m256 data; } float8; +typedef union { __m256 data[2]; } float16; + +typedef union { double data; } double1; +typedef union { __m128d data; } double2; +typedef union { __m256d data; } double4; +typedef union { double4 data; } double3; +typedef union { __m256d data[2]; } double8; 
+typedef union { __m256d data[4]; } double16; + +#else // !defined(_MSC_VER) + +typedef union { char data; } char1; +typedef union { char data[2]; } char2; +typedef union { char data[4]; } char4; +typedef union { char data[8]; } char8; +typedef union { char data[16]; } char16; +typedef union { char4 data; } char3; + +typedef union { unsigned char data; } uchar1; +typedef union { unsigned char data[2]; } uchar2; +typedef union { unsigned char data[4]; } uchar4; +typedef union { unsigned char data[8]; } uchar8; +typedef union { unsigned char data[16]; } uchar16; +typedef union { uchar4 data; } uchar3; + +typedef union { short data; } short1; +typedef union { short data[2]; } short2; +typedef union { short data[4]; } short4; +typedef union { short data[8]; } short8; +typedef union { short data[16]; } short16; +typedef union { short4 data; } short3; + +typedef union { unsigned short data; } ushort1; +typedef union { unsigned short data[2]; } ushort2; +typedef union { unsigned short data[4]; } ushort4; +typedef union { unsigned short data[8]; } ushort8; +typedef union { unsigned short data[16]; } ushort16; +typedef union { ushort4 data; } ushort3; + +typedef union { int data; } int1; +typedef union { int data[2]; } int2; +typedef union { int data[4]; } int4; +typedef union { int data[8]; } int8; +typedef union { int data[16]; } int16; +typedef union { int4 data; } int3; + +typedef union { unsigned int data; } uint1; +typedef union { unsigned int data[2]; } uint2; +typedef union { unsigned int data[4]; } uint4; +typedef union { unsigned int data[8]; } uint8; +typedef union { unsigned int data[16]; } uint16; +typedef union { uint4 data; } uint3; + +typedef union { long data; } long1; +typedef union { long data[2]; } long2; +typedef union { long data[4]; } long4; +typedef union { long data[8]; } long8; +typedef union { long data[16]; } long16; +typedef union { long4 data; } long3; + +typedef union { unsigned long data; } ulong1; +typedef union { unsigned long data[2]; } ulong2; +typedef union { unsigned long data[4]; } ulong4; +typedef union { unsigned long data[8]; } ulong8; +typedef union { unsigned long data[16]; } ulong16; +typedef union { ulong4 data; } ulong3; + +typedef union { long long data; } longlong1; +typedef union { long long data[2]; } longlong2; +typedef union { long long data[4]; } longlong4; +typedef union { long long data[8]; } longlong8; +typedef union { long long data[16]; } longlong16; +typedef union { longlong4 data; } longlong3; + +typedef union { unsigned long long data; } ulonglong1; +typedef union { unsigned long long data[2]; } ulonglong2; +typedef union { unsigned long long data[4]; } ulonglong4; +typedef union { unsigned long long data[8]; } ulonglong8; +typedef union { unsigned long long data[16]; } ulonglong16; +typedef union { ulonglong4 data; } ulonglong3; + +typedef union { float data; } float1; +typedef union { float data[2]; } float2; +typedef union { float data[4]; } float4; +typedef union { float data[8]; } float8; +typedef union { float data[16]; } float16; +typedef union { float4 data; } float3; + +typedef union { double data; } double1; +typedef union { double data[2]; } double2; +typedef union { double data[4]; } double4; +typedef union { double data[8]; } double8; +typedef union { double data[16]; } double16; +typedef union { double4 data; } double3; + +#endif // defined(_MSC_VER) +#endif // defined(__has_attribute) +#endif diff --git a/include/hip/amd_detail/hiprtc.h b/include/hip/amd_detail/hiprtc.h new file mode 100644 index 0000000000..fecea75340 --- 
/dev/null
+++ b/include/hip/amd_detail/hiprtc.h
@@ -0,0 +1,94 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef HIPRTC_H
+#define HIPRTC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include
+
+#if !defined(_WIN32)
+#pragma GCC visibility push (default)
+#endif
+
+enum hiprtcResult {
+    HIPRTC_SUCCESS = 0,
+    HIPRTC_ERROR_OUT_OF_MEMORY = 1,
+    HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+    HIPRTC_ERROR_INVALID_INPUT = 3,
+    HIPRTC_ERROR_INVALID_PROGRAM = 4,
+    HIPRTC_ERROR_INVALID_OPTION = 5,
+    HIPRTC_ERROR_COMPILATION = 6,
+    HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+    HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+    HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+    HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+    HIPRTC_ERROR_INTERNAL_ERROR = 11
+};
+
+const char* hiprtcGetErrorString(hiprtcResult result);
+
+
+hiprtcResult hiprtcVersion(int* major, int* minor);
+
+typedef struct _hiprtcProgram* hiprtcProgram;
+
+hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog,
+                                     const char* name_expression);
+
+hiprtcResult hiprtcCompileProgram(hiprtcProgram prog,
+                                  int numOptions,
+                                  const char** options);
+
+hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog,
+                                 const char* src,
+                                 const char* name,
+                                 int numHeaders,
+                                 const char** headers,
+                                 const char** includeNames);
+
+hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog);
+
+hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog,
+                                  const char* name_expression,
+                                  const char** lowered_name);
+
+hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log);
+
+hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog,
+                                     size_t* logSizeRet);
+
+hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code);
+
+hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet);
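+
+// Illustrative workflow for the API above (not part of the original header;
+// error checking elided and `kernelSrc` is a hypothetical HIP source string):
+//
+//   hiprtcProgram prog;
+//   hiprtcCreateProgram(&prog, kernelSrc, "saxpy.cu", 0, nullptr, nullptr);
+//   hiprtcCompileProgram(prog, 0, nullptr);
+//   size_t codeSize;
+//   hiprtcGetCodeSize(prog, &codeSize);
+//   std::vector<char> code(codeSize);
+//   hiprtcGetCode(prog, code.data());
+//   hiprtcDestroyProgram(&prog);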
+
+#if !defined(_WIN32)
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif //HIPRTC_H
diff --git a/include/hip/amd_detail/host_defines.h b/include/hip/amd_detail/host_defines.h
new file mode 100644
index 0000000000..7f1075acc1
--- /dev/null
+++ b/include/hip/amd_detail/host_defines.h
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file amd_detail/host_defines.h
+ * @brief TODO-doc
+ */
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HOST_DEFINES_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HOST_DEFINES_H
+
+// The following macro should be removed after the upstream update.
+// It is defined here as a workaround for a rocThrust build failure.
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H
+
+// Add guard to Generic Grid Launch method
+#ifndef GENERIC_GRID_LAUNCH
+#define GENERIC_GRID_LAUNCH 1
+#endif
+
+#if defined(__clang__) && defined(__HIP__)
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+#define __noinline__ __attribute__((noinline))
+#define __forceinline__ inline __attribute__((always_inline))
+
+#else
+
+// Non-HCC compiler
+/**
+ * Function and kernel markers
+ */
+#define __host__
+#define __device__
+
+#define __global__
+
+#define __noinline__
+#define __forceinline__ inline
+
+#define __shared__
+#define __constant__
+
+#endif
+
+#endif
diff --git a/include/hip/amd_detail/hsa_helpers.hpp b/include/hip/amd_detail/hsa_helpers.hpp
new file mode 100644
index 0000000000..af4f0c93ab
--- /dev/null
+++ b/include/hip/amd_detail/hsa_helpers.hpp
@@ -0,0 +1,102 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#pragma once + +#include + +#include +#include +#include + +namespace hip_impl { +inline void* address(hsa_executable_symbol_t x) { + void* r = nullptr; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r); + + return r; +} + +inline hsa_agent_t agent(hsa_executable_symbol_t x) { + hsa_agent_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r); + + return r; +} + +inline std::uint32_t group_size(hsa_executable_symbol_t x) { + std::uint32_t r = 0u; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r); + + return r; +} + +inline hsa_isa_t isa(hsa_agent_t x) { + hsa_isa_t r = {}; + hsa_agent_iterate_isas(x, + [](hsa_isa_t i, void* o) { + *static_cast(o) = i; // Pick the first. + + return HSA_STATUS_INFO_BREAK; + }, + &r); + + return r; +} + +inline std::uint64_t kernel_object(hsa_executable_symbol_t x) { + std::uint64_t r = 0u; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r); + + return r; +} + +inline std::string name(hsa_executable_symbol_t x) { + std::uint32_t sz = 0u; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz); + + std::string r(sz, '\0'); + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front()); + + return r; +} + +inline std::uint32_t private_size(hsa_executable_symbol_t x) { + std::uint32_t r = 0u; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r); + + return r; +} + +inline std::uint32_t size(hsa_executable_symbol_t x) { + std::uint32_t r = 0; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r); + + return r; +} + +inline hsa_symbol_kind_t type(hsa_executable_symbol_t x) { + hsa_symbol_kind_t r = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r); + + return r; +} +} // namespace hip_impl \ No newline at end of file diff --git a/include/hip/amd_detail/library_types.h b/include/hip/amd_detail/library_types.h new file mode 100644 index 0000000000..6c3e111220 --- /dev/null +++ b/include/hip/amd_detail/library_types.h @@ -0,0 +1,41 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_LIBRARY_TYPES_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_LIBRARY_TYPES_H + +typedef enum hipDataType { + HIP_R_16F = 2, + HIP_R_32F = 0, + HIP_R_64F = 1, + HIP_C_16F = 6, + HIP_C_32F = 4, + HIP_C_64F = 5 +} hipDataType; + +typedef enum hipLibraryPropertyType { + HIP_LIBRARY_MAJOR_VERSION, + HIP_LIBRARY_MINOR_VERSION, + HIP_LIBRARY_PATCH_LEVEL +} hipLibraryPropertyType; + +#endif diff --git a/include/hip/amd_detail/llvm_intrinsics.h b/include/hip/amd_detail/llvm_intrinsics.h new file mode 100644 index 0000000000..c100b846c0 --- /dev/null +++ b/include/hip/amd_detail/llvm_intrinsics.h @@ -0,0 +1,41 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file amd_detail/llvm_intrinsics.h + * @brief Contains declarations for wrapper functions for llvm intrinsics + * like llvm.amdgcn.s.barrier. + */ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_LLVM_INTRINSICS_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_LLVM_INTRINSICS_H + +#include "hip/amd_detail/host_defines.h" + +// FIXME: These should all be removed and proper builtins used. +__device__ +unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize"); + +__device__ +int __llvm_amdgcn_ds_swizzle(int index, int pattern) __asm("llvm.amdgcn.ds.swizzle"); + +#endif diff --git a/include/hip/amd_detail/macro_based_grid_launch.hpp b/include/hip/amd_detail/macro_based_grid_launch.hpp new file mode 100644 index 0000000000..96d449b213 --- /dev/null +++ b/include/hip/amd_detail/macro_based_grid_launch.hpp @@ -0,0 +1,798 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "concepts.hpp"
+#include "helpers.hpp"
+
+#include "hc.hpp"
+#include "hip/hip_ext.h"
+#include "hip_runtime.h"
+
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+namespace hip_impl {
+namespace {
+struct New_grid_launch_tag {};
+struct Old_grid_launch_tag {};
+
+template <typename C, typename D>
+class RAII_guard {
+    D dtor_;
+
+   public:
+    RAII_guard() = default;
+
+    RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)} { ctor(); }
+
+    RAII_guard(const RAII_guard&) = default;
+    RAII_guard(RAII_guard&&) = default;
+
+    RAII_guard& operator=(const RAII_guard&) = default;
+    RAII_guard& operator=(RAII_guard&&) = default;
+
+    ~RAII_guard() { dtor_(); }
+};
+
+template <typename C, typename D>
+RAII_guard<C, D> make_RAII_guard(const C& ctor, D dtor) {
+    return RAII_guard<C, D>{ctor, std::move(dtor)};
+}
+
+template
+using is_new_grid_launch_t = typename std::conditional{}, New_grid_launch_tag,
+                                                       Old_grid_launch_tag>::type;
+}  // namespace
+
+// TODO: - dispatch rank should be derived from the domain dimensions passed
+//         in, and not always assumed to be 3;
+
+template <typename... Ts, typename K>
+requires(Domain<K> ==
+         {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch_tag, dim3 num_blocks,
+                                                    dim3 dim_blocks, int group_mem_bytes,
+                                                    const hc::accelerator_view& acc_v, K k) {
+    const auto d =
+        hc::extent<3>{num_blocks.z * dim_blocks.z, num_blocks.y * dim_blocks.y,
+                      num_blocks.x * dim_blocks.x}
+            .tile_with_dynamic(dim_blocks.z, dim_blocks.y, dim_blocks.x, group_mem_bytes);
+
+    try {
+        hc::parallel_for_each(acc_v, d, k);
+    } catch (std::exception& ex) {
+        std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
+        hip_throw(ex);
+    }
+}
+
+// TODO: these are workarounds, they should be removed.
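+//
+// A worked example of the mapping above (hypothetical launch values, for
+// illustration only): num_blocks = dim3{4, 2, 1} and dim_blocks = dim3{64, 1, 1}
+// produce hc::extent<3>{1 * 1, 2 * 1, 4 * 64} = {1, 2, 256}, tiled by
+// {1, 1, 64}; i.e. 8 workgroups of 64 work-items each, with group_mem_bytes
+// of dynamically allocated group (LDS) memory per tile.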
+
+hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&);
+void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t);
+void unlock_stream_hip_(hipStream_t, void*, const char*, hc::accelerator_view*);
+
+template <typename... Ts, typename K>
+requires(Domain<K> == {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch_tag,
+                                                                 dim3 num_blocks, dim3 dim_blocks,
+                                                                 int group_mem_bytes,
+                                                                 hipStream_t stream,
+                                                                 const char* kernel_name, K k) {
+    void* lck_stream = nullptr;
+    auto acc_v = lock_stream_hip_(stream, lck_stream);
+    auto stream_guard =
+        make_RAII_guard(std::bind(print_prelaunch_trace_, kernel_name, num_blocks, dim_blocks,
+                                  group_mem_bytes, stream),
+                        std::bind(unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v));
+
+    try {
+        grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
+                              group_mem_bytes, acc_v, std::move(k));
+    } catch (std::exception& ex) {
+        std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
+        hip_throw(ex);
+    }
+}
+
+template <typename... Ts, typename K>
+requires(Domain<K> ==
+         {hipLaunchParm, Ts...}) inline void grid_launch_hip_impl_(Old_grid_launch_tag,
+                                                                   dim3 num_blocks, dim3 dim_blocks,
+                                                                   int group_mem_bytes,
+                                                                   hipStream_t stream, K k) {
+    grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
+                          group_mem_bytes, std::move(stream), std::move(k));
+}
+
+template <typename... Ts, typename K>
+requires(Domain<K> == {hipLaunchParm, Ts...}) inline void grid_launch_hip_impl_(
+    Old_grid_launch_tag, dim3 num_blocks, dim3 dim_blocks, int group_mem_bytes, hipStream_t stream,
+    const char* kernel_name, K k) {
+    grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
+                          group_mem_bytes, std::move(stream), kernel_name, std::move(k));
+}
+
+template <typename... Ts, typename K>
+requires(Domain<K> == {Ts...}) inline std::enable_if_t<
+    !std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks, dim3 dim_blocks,
+                                                  int group_mem_bytes, hipStream_t stream,
+                                                  const char* kernel_name, K k) {
+    grid_launch_hip_impl_(is_new_grid_launch_t<K>{}, std::move(num_blocks),
+                          std::move(dim_blocks), group_mem_bytes, std::move(stream), kernel_name,
+                          std::move(k));
+}
+
+template <typename... Ts, typename K>
+requires(Domain<K> == {Ts...}) inline std::enable_if_t<
+    !std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks, dim3 dim_blocks,
+                                                  int group_mem_bytes, hipStream_t stream, K k) {
+    grid_launch_hip_impl_(is_new_grid_launch_t<K>{}, std::move(num_blocks),
+                          std::move(dim_blocks), group_mem_bytes, std::move(stream), std::move(k));
+}
+
+// TODO: these are temporary and purposefully noisy and disruptive.
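+//
+// Rough sketch of what the generators below emit, assuming a hypothetical
+// user kernel void my_kernel(int* p, float x) (the names my_kernel, my_fn,
+// p and x are illustrative only):
+//
+//   make_kernel_functor_hip_4(my_fn, my_kernel, p, x) defines a struct that
+//   stores std::decay_t<decltype(p)> and std::decay_t<decltype(x)> copies of
+//   the arguments and whose operator()(const hc::tiled_index<3>&) const
+//   [[hc]] body simply forwards them to my_kernel(_p0_, _p1_).
+//   hipLaunchNamedKernelGGL then brace-initialises one such functor from
+//   __VA_ARGS__ and hands it to hip_impl::grid_launch_hip_.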
+#define make_kernel_name_hip(k, n) \ + HIP_kernel_functor_name_begin##_##k##_##HIP_kernel_functor_name_end##_##n + +#define make_kernel_functor_hip_30(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \ + p22, p23, p24, p25, p26, p27) \ + struct make_kernel_name_hip(function_name, 28) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + std::decay_t _p18_; \ + std::decay_t _p19_; \ + std::decay_t _p20_; \ + std::decay_t _p21_; \ + std::decay_t _p22_; \ + std::decay_t _p23_; \ + std::decay_t _p24_; \ + std::decay_t _p25_; \ + std::decay_t _p26_; \ + std::decay_t _p27_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \ + _p22_, _p23_, _p24_, _p25_, _p26_, _p27_); \ + } \ + } +#define make_kernel_functor_hip_29(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \ + p22, p23, p24, p25, p26) \ + struct make_kernel_name_hip(function_name, 27) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + std::decay_t _p18_; \ + std::decay_t _p19_; \ + std::decay_t _p20_; \ + std::decay_t _p21_; \ + std::decay_t _p22_; \ + std::decay_t _p23_; \ + std::decay_t _p24_; \ + std::decay_t _p25_; \ + std::decay_t _p26_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \ + _p22_, _p23_, _p24_, _p25_, _p26_); \ + } \ + } +#define make_kernel_functor_hip_28(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \ + p22, p23, p24, p25) \ + struct make_kernel_name_hip(function_name, 26) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + std::decay_t _p18_; \ + std::decay_t _p19_; \ + std::decay_t _p20_; \ + std::decay_t _p21_; \ + std::decay_t _p22_; \ + std::decay_t _p23_; \ + std::decay_t _p24_; \ + std::decay_t _p25_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, 
_p20_, _p21_, \ + _p22_, _p23_, _p24_, _p25_); \ + } \ + } +#define make_kernel_functor_hip_27(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \ + p22, p23, p24) \ + struct make_kernel_name_hip(function_name, 25) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + std::decay_t _p18_; \ + std::decay_t _p19_; \ + std::decay_t _p20_; \ + std::decay_t _p21_; \ + std::decay_t _p22_; \ + std::decay_t _p23_; \ + std::decay_t _p24_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \ + _p22_, _p23_, _p24_); \ + } \ + } +#define make_kernel_functor_hip_26(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \ + p22, p23) \ + struct make_kernel_name_hip(function_name, 24) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + std::decay_t _p18_; \ + std::decay_t _p19_; \ + std::decay_t _p20_; \ + std::decay_t _p21_; \ + std::decay_t _p22_; \ + std::decay_t _p23_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \ + _p22_, _p23_); \ + } \ + } +#define make_kernel_functor_hip_25(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \ + p22) \ + struct make_kernel_name_hip(function_name, 23) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + std::decay_t _p18_; \ + std::decay_t _p19_; \ + std::decay_t _p20_; \ + std::decay_t _p21_; \ + std::decay_t _p22_; \ + __attribute__((used, flatten)) void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \ + _p22_); \ + } \ + } +#define make_kernel_functor_hip_24(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21) \ + struct make_kernel_name_hip(function_name, 22) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; 
\ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + std::decay_t _p18_; \ + std::decay_t _p19_; \ + std::decay_t _p20_; \ + std::decay_t _p21_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_); \ + } \ + } +#define make_kernel_functor_hip_23(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) \ + struct make_kernel_name_hip(function_name, 21) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + std::decay_t _p18_; \ + std::decay_t _p19_; \ + std::decay_t _p20_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_); \ + } \ + } +#define make_kernel_functor_hip_22(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) \ + struct make_kernel_name_hip(function_name, 20) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + std::decay_t _p18_; \ + std::decay_t _p19_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_); \ + } \ + } +#define make_kernel_functor_hip_21(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) \ + struct make_kernel_name_hip(function_name, 19) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + std::decay_t _p18_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_); \ + } \ + } +#define make_kernel_functor_hip_20(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16, p17) 
\ + struct make_kernel_name_hip(function_name, 18) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + std::decay_t _p17_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_, _p17_); \ + } \ + } +#define make_kernel_functor_hip_19(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15, p16) \ + struct make_kernel_name_hip(function_name, 17) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + std::decay_t _p16_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_, _p16_); \ + } \ + } +#define make_kernel_functor_hip_18(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14, p15) \ + struct make_kernel_name_hip(function_name, 16) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + std::decay_t _p15_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_, _p15_); \ + } \ + } +#define make_kernel_functor_hip_17(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13, p14) \ + struct make_kernel_name_hip(function_name, 15) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + std::decay_t _p14_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_, _p14_); \ + } \ + } +#define make_kernel_functor_hip_16(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12, p13) \ + struct make_kernel_name_hip(function_name, 14) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + std::decay_t _p13_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + 
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_, _p13_); \ + } \ + } +#define make_kernel_functor_hip_15(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11, p12) \ + struct make_kernel_name_hip(function_name, 13) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + std::decay_t _p12_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \ + _p12_); \ + } \ + } +#define make_kernel_functor_hip_14(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10, p11) \ + struct make_kernel_name_hip(function_name, 12) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + std::decay_t _p11_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_); \ + } \ + } +#define make_kernel_functor_hip_13(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9, p10) \ + struct make_kernel_name_hip(function_name, 11) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + std::decay_t _p10_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { \ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_); \ + } \ + } +#define make_kernel_functor_hip_12(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \ + p9) \ + struct make_kernel_name_hip(function_name, 10) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + std::decay_t _p9_; \ + void operator()(const hc::tiled_index<3>&) const \ + [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_); } \ + } +#define make_kernel_functor_hip_11(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8) \ + struct make_kernel_name_hip(function_name, 9) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + std::decay_t _p8_; \ + void operator()(const hc::tiled_index<3>&) const \ + [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_); } \ + } +#define make_kernel_functor_hip_10(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7) \ + struct make_kernel_name_hip(function_name, 8) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + std::decay_t _p7_; \ + void operator()(const hc::tiled_index<3>&) const \ + [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_); } \ + } +#define make_kernel_functor_hip_9(function_name, kernel_name, p0, p1, p2, p3, p4, p5, 
p6) \ + struct make_kernel_name_hip(function_name, 7) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + std::decay_t _p6_; \ + void operator()(const hc::tiled_index<3>&) const \ + [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_); } \ + } +#define make_kernel_functor_hip_8(function_name, kernel_name, p0, p1, p2, p3, p4, p5) \ + struct make_kernel_name_hip(function_name, 6) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + std::decay_t _p5_; \ + void operator()(const hc::tiled_index<3>&) const \ + [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_); } \ + } +#define make_kernel_functor_hip_7(function_name, kernel_name, p0, p1, p2, p3, p4) \ + struct make_kernel_name_hip(function_name, 5) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + std::decay_t _p4_; \ + void operator()(const hc::tiled_index<3>&) const \ + [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_); } \ + } +#define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3) \ + struct make_kernel_name_hip(function_name, 4) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + std::decay_t _p3_; \ + void operator()(const hc::tiled_index<3>&) const \ + [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_); } \ + } +#define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2) \ + struct make_kernel_name_hip(function_name, 3) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + std::decay_t _p2_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_, _p1_, _p2_); } \ + } +#define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1) \ + struct make_kernel_name_hip(function_name, 2) { \ + std::decay_t _p0_; \ + std::decay_t _p1_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_, _p1_); } \ + } +#define fofo(f, n) kernel_prefix_hip##f##kernel_suffix_hip##n +#define make_kernel_functor_hip_3(function_name, kernel_name, p0) \ + struct make_kernel_name_hip(function_name, 1) { \ + std::decay_t _p0_; \ + void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_); } \ + } +#define make_kernel_functor_hip_2(function_name, kernel_name) \ + struct make_kernel_name_hip(function_name, 0) { \ + void operator()(const hc::tiled_index<3>&)[[hc]] { return kernel_name(hipLaunchParm{}); } \ + } +#define make_kernel_functor_hip_1(...) +#define make_kernel_functor_hip_0(...) +#define make_kernel_functor_hip_(...) overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) + + +#define hipLaunchNamedKernelGGL(function_name, kernel_name, num_blocks, dim_blocks, \ + group_mem_bytes, stream, ...) \ + do { \ + make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__) \ + hip_kernel_functor_impl_{__VA_ARGS__}; \ + hip_impl::grid_launch_hip_(num_blocks, dim_blocks, group_mem_bytes, stream, #kernel_name, \ + hip_kernel_functor_impl_); \ + } while (0) + +#define hipLaunchKernelGGL(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...) \ + do { \ + hipLaunchNamedKernelGGL(unnamed, kernel_name, num_blocks, dim_blocks, group_mem_bytes, \ + stream, ##__VA_ARGS__); \ + } while (0) + +#define hipLaunchKernel(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...) 
\ + do { \ + hipLaunchKernelGGL(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, \ + hipLaunchParm{}, ##__VA_ARGS__); \ + } while (0) +} // namespace hip_impl diff --git a/include/hip/amd_detail/math_functions.h b/include/hip/amd_detail/math_functions.h new file mode 100644 index 0000000000..2cbee4829a --- /dev/null +++ b/include/hip/amd_detail/math_functions.h @@ -0,0 +1,1502 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include "hip_fp16_math_fwd.h" +#include "hip_vector_types.h" +#include "math_fwd.h" + +#include + +#if !defined(__HIPCC_RTC__) +#include +// assert.h is only for the host version of assert. +// The device version of assert is implemented in hip/amd_detail/hip_runtime.h. +// Users should include hip_runtime.h for the device version of assert. 
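+// Illustrative example: inside a __global__ function, assert(expr) resolves
+// to the device implementation that hip_runtime.h provides, while the same
+// expression in host code uses the C library assert pulled in under the
+// !__HIP_DEVICE_COMPILE__ guard below.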
+#if !__HIP_DEVICE_COMPILE__
+#include <assert.h>
+#endif
+#include <limits.h>
+#include <limits>
+#include <stdint.h>
+#endif // !defined(__HIPCC_RTC__)
+
+#if _LIBCPP_VERSION && __HIP__
+namespace std {
+template <>
+struct __numeric_type<_Float16>
+{
+    static _Float16 __test(_Float16);
+
+    typedef _Float16 type;
+    static const bool value = true;
+};
+}
+#endif // _LIBCPP_VERSION
+
+#pragma push_macro("__DEVICE__")
+#pragma push_macro("__RETURN_TYPE")
+
+#define __DEVICE__ static __device__
+#define __RETURN_TYPE bool
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+__DEVICE__
+inline
+uint64_t __make_mantissa_base8(const char* tagp)
+{
+    uint64_t r = 0;
+    while (*tagp != '\0') {
+        char tmp = *tagp;
+
+        if (tmp >= '0' && tmp <= '7') r = (r * 8u) + tmp - '0';
+        else return 0;
+
+        ++tagp;
+    }
+
+    return r;
+}
+
+__DEVICE__
+inline
+uint64_t __make_mantissa_base10(const char* tagp)
+{
+    uint64_t r = 0;
+    while (*tagp != '\0') {
+        char tmp = *tagp;
+
+        if (tmp >= '0' && tmp <= '9') r = (r * 10u) + tmp - '0';
+        else return 0;
+
+        ++tagp;
+    }
+
+    return r;
+}
+
+__DEVICE__
+inline
+uint64_t __make_mantissa_base16(const char* tagp)
+{
+    uint64_t r = 0;
+    while (*tagp != '\0') {
+        char tmp = *tagp;
+
+        if (tmp >= '0' && tmp <= '9') r = (r * 16u) + tmp - '0';
+        else if (tmp >= 'a' && tmp <= 'f') r = (r * 16u) + tmp - 'a' + 10;
+        else if (tmp >= 'A' && tmp <= 'F') r = (r * 16u) + tmp - 'A' + 10;
+        else return 0;
+
+        ++tagp;
+    }
+
+    return r;
+}
+
+__DEVICE__
+inline
+uint64_t __make_mantissa(const char* tagp)
+{
+    if (!tagp) return 0u;
+
+    if (*tagp == '0') {
+        ++tagp;
+
+        if (*tagp == 'x' || *tagp == 'X') return __make_mantissa_base16(tagp);
+        else return __make_mantissa_base8(tagp);
+    }
+
+    return __make_mantissa_base10(tagp);
+}
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+// DOT FUNCTIONS
+#if __HIP_CLANG_ONLY__
+__DEVICE__
+inline
+int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) {
+    return __ockl_sdot2(a.data, b.data, c, saturate);
+}
+__DEVICE__
+inline
+uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) {
+    return __ockl_udot2(a.data, b.data, c, saturate);
+}
+__DEVICE__
+inline
+int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) {
+    return __ockl_sdot4(a.data, b.data, c, saturate);
+}
+__DEVICE__
+inline
+uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) {
+    return __ockl_udot4(a.data, b.data, c, saturate);
+}
+__DEVICE__
+inline
+int amd_mixed_dot(int a, int b, int c, bool saturate) {
+    return __ockl_sdot8(a, b, c, saturate);
+}
+__DEVICE__
+inline
+uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) {
+    return __ockl_udot8(a, b, c, saturate);
+}
+#endif
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// BEGIN FLOAT
+__DEVICE__
+inline
+float abs(float x) { return __ocml_fabs_f32(x); }
+__DEVICE__
+inline
+float acosf(float x) { return __ocml_acos_f32(x); }
+__DEVICE__
+inline
+float acoshf(float x) { return __ocml_acosh_f32(x); }
+__DEVICE__
+inline
+float asinf(float x) { return __ocml_asin_f32(x); }
+__DEVICE__
+inline
+float asinhf(float x) { return __ocml_asinh_f32(x); }
+__DEVICE__
+inline
+float atan2f(float x, float y) { return __ocml_atan2_f32(x, y); }
+__DEVICE__
+inline
+float atanf(float x) { return __ocml_atan_f32(x); }
+__DEVICE__
+inline
+float atanhf(float x) { return __ocml_atanh_f32(x); }
+__DEVICE__
+inline
+float cbrtf(float x) { return __ocml_cbrt_f32(x); }
+__DEVICE__
+inline
+float ceilf(float x) { return __ocml_ceil_f32(x); }
+__DEVICE__
+inline
+float copysignf(float x, float y) { return __ocml_copysign_f32(x, y); }
+__DEVICE__
+inline
+float cosf(float x) { return
__ocml_cos_f32(x); } +__DEVICE__ +inline +float coshf(float x) { return __ocml_cosh_f32(x); } +__DEVICE__ +inline +float cospif(float x) { return __ocml_cospi_f32(x); } +__DEVICE__ +inline +float cyl_bessel_i0f(float x) { return __ocml_i0_f32(x); } +__DEVICE__ +inline +float cyl_bessel_i1f(float x) { return __ocml_i1_f32(x); } +__DEVICE__ +inline +float erfcf(float x) { return __ocml_erfc_f32(x); } +__DEVICE__ +inline +float erfcinvf(float x) { return __ocml_erfcinv_f32(x); } +__DEVICE__ +inline +float erfcxf(float x) { return __ocml_erfcx_f32(x); } +__DEVICE__ +inline +float erff(float x) { return __ocml_erf_f32(x); } +__DEVICE__ +inline +float erfinvf(float x) { return __ocml_erfinv_f32(x); } +__DEVICE__ +inline +float exp10f(float x) { return __ocml_exp10_f32(x); } +__DEVICE__ +inline +float exp2f(float x) { return __ocml_exp2_f32(x); } +__DEVICE__ +inline +float expf(float x) { return __ocml_exp_f32(x); } +__DEVICE__ +inline +float expm1f(float x) { return __ocml_expm1_f32(x); } +__DEVICE__ +inline +float fabsf(float x) { return __ocml_fabs_f32(x); } +__DEVICE__ +inline +float fdimf(float x, float y) { return __ocml_fdim_f32(x, y); } +__DEVICE__ +inline +float fdividef(float x, float y) { return x / y; } +__DEVICE__ +inline +float floorf(float x) { return __ocml_floor_f32(x); } +__DEVICE__ +inline +float fmaf(float x, float y, float z) { return __ocml_fma_f32(x, y, z); } +__DEVICE__ +inline +float fmaxf(float x, float y) { return __ocml_fmax_f32(x, y); } +__DEVICE__ +inline +float fminf(float x, float y) { return __ocml_fmin_f32(x, y); } +__DEVICE__ +inline +float fmodf(float x, float y) { return __ocml_fmod_f32(x, y); } +__DEVICE__ +inline +float frexpf(float x, int* nptr) +{ + int tmp; + float r = + __ocml_frexp_f32(x, (__attribute__((address_space(5))) int*) &tmp); + *nptr = tmp; + + return r; +} +__DEVICE__ +inline +float hypotf(float x, float y) { return __ocml_hypot_f32(x, y); } +__DEVICE__ +inline +int ilogbf(float x) { return __ocml_ilogb_f32(x); } +__DEVICE__ +inline +__RETURN_TYPE isfinite(float x) { return __ocml_isfinite_f32(x); } +__DEVICE__ +inline +__RETURN_TYPE isinf(float x) { return __ocml_isinf_f32(x); } +__DEVICE__ +inline +__RETURN_TYPE isnan(float x) { return __ocml_isnan_f32(x); } +__DEVICE__ +inline +float j0f(float x) { return __ocml_j0_f32(x); } +__DEVICE__ +inline +float j1f(float x) { return __ocml_j1_f32(x); } +__DEVICE__ +inline +float jnf(int n, float x) +{ // TODO: we could use Ahmes multiplication and the Miller & Brown algorithm + // for linear recurrences to get O(log n) steps, but it's unclear if + // it'd be beneficial in this case. 
+ if (n == 0) return j0f(x); + if (n == 1) return j1f(x); + + float x0 = j0f(x); + float x1 = j1f(x); + for (int i = 1; i < n; ++i) { + float x2 = (2 * i) / x * x1 - x0; + x0 = x1; + x1 = x2; + } + + return x1; +} +__DEVICE__ +inline +float ldexpf(float x, int e) { return __ocml_ldexp_f32(x, e); } +__DEVICE__ +inline +float lgammaf(float x) { return __ocml_lgamma_f32(x); } +__DEVICE__ +inline +long long int llrintf(float x) { return __ocml_rint_f32(x); } +__DEVICE__ +inline +long long int llroundf(float x) { return __ocml_round_f32(x); } +__DEVICE__ +inline +float log10f(float x) { return __ocml_log10_f32(x); } +__DEVICE__ +inline +float log1pf(float x) { return __ocml_log1p_f32(x); } +__DEVICE__ +inline +float log2f(float x) { return __ocml_log2_f32(x); } +__DEVICE__ +inline +float logbf(float x) { return __ocml_logb_f32(x); } +__DEVICE__ +inline +float logf(float x) { return __ocml_log_f32(x); } +__DEVICE__ +inline +long int lrintf(float x) { return __ocml_rint_f32(x); } +__DEVICE__ +inline +long int lroundf(float x) { return __ocml_round_f32(x); } +__DEVICE__ +inline +float modff(float x, float* iptr) +{ + float tmp; + float r = + __ocml_modf_f32(x, (__attribute__((address_space(5))) float*) &tmp); + *iptr = tmp; + + return r; +} +__DEVICE__ +inline +float nanf(const char* tagp) +{ + union { + float val; + struct ieee_float { + uint32_t mantissa : 22; + uint32_t quiet : 1; + uint32_t exponent : 8; + uint32_t sign : 1; + } bits; + + static_assert(sizeof(float) == sizeof(ieee_float), ""); + } tmp; + + tmp.bits.sign = 0u; + tmp.bits.exponent = ~0u; + tmp.bits.quiet = 1u; + tmp.bits.mantissa = __make_mantissa(tagp); + + return tmp.val; +} +__DEVICE__ +inline +float nearbyintf(float x) { return __ocml_nearbyint_f32(x); } +__DEVICE__ +inline +float nextafterf(float x, float y) { return __ocml_nextafter_f32(x, y); } +__DEVICE__ +inline +float norm3df(float x, float y, float z) { return __ocml_len3_f32(x, y, z); } +__DEVICE__ +inline +float norm4df(float x, float y, float z, float w) +{ + return __ocml_len4_f32(x, y, z, w); +} +__DEVICE__ +inline +float normcdff(float x) { return __ocml_ncdf_f32(x); } +__DEVICE__ +inline +float normcdfinvf(float x) { return __ocml_ncdfinv_f32(x); } +__DEVICE__ +inline +float normf(int dim, const float* a) +{ // TODO: placeholder until OCML adds support. + float r = 0; + while (dim--) { r += a[0] * a[0]; ++a; } + + return __ocml_sqrt_f32(r); +} +__DEVICE__ +inline +float powf(float x, float y) { return __ocml_pow_f32(x, y); } +__DEVICE__ +inline +float powif(float base, int iexp) { return __ocml_pown_f32(base, iexp); } +__DEVICE__ +inline +float rcbrtf(float x) { return __ocml_rcbrt_f32(x); } +__DEVICE__ +inline +float remainderf(float x, float y) { return __ocml_remainder_f32(x, y); } +__DEVICE__ +inline +float remquof(float x, float y, int* quo) +{ + int tmp; + float r = + __ocml_remquo_f32(x, y, (__attribute__((address_space(5))) int*) &tmp); + *quo = tmp; + + return r; +} +__DEVICE__ +inline +float rhypotf(float x, float y) { return __ocml_rhypot_f32(x, y); } +__DEVICE__ +inline +float rintf(float x) { return __ocml_rint_f32(x); } +__DEVICE__ +inline +float rnorm3df(float x, float y, float z) +{ + return __ocml_rlen3_f32(x, y, z); +} + +__DEVICE__ +inline +float rnorm4df(float x, float y, float z, float w) +{ + return __ocml_rlen4_f32(x, y, z, w); +} +__DEVICE__ +inline +float rnormf(int dim, const float* a) +{ // TODO: placeholder until OCML adds support. 
+ float r = 0; + while (dim--) { r += a[0] * a[0]; ++a; } + + return __ocml_rsqrt_f32(r); +} +__DEVICE__ +inline +float roundf(float x) { return __ocml_round_f32(x); } +__DEVICE__ +inline +float rsqrtf(float x) { return __ocml_rsqrt_f32(x); } +__DEVICE__ +inline +float scalblnf(float x, long int n) +{ + return (n < INT_MAX) ? __ocml_scalbn_f32(x, n) : __ocml_scalb_f32(x, n); +} +__DEVICE__ +inline +float scalbnf(float x, int n) { return __ocml_scalbn_f32(x, n); } +__DEVICE__ +inline +__RETURN_TYPE signbit(float x) { return __ocml_signbit_f32(x); } +__DEVICE__ +inline +void sincosf(float x, float* sptr, float* cptr) +{ + float tmp; + + *sptr = + __ocml_sincos_f32(x, (__attribute__((address_space(5))) float*) &tmp); + *cptr = tmp; +} +__DEVICE__ +inline +void sincospif(float x, float* sptr, float* cptr) +{ + float tmp; + + *sptr = + __ocml_sincospi_f32(x, (__attribute__((address_space(5))) float*) &tmp); + *cptr = tmp; +} +__DEVICE__ +inline +float sinf(float x) { return __ocml_sin_f32(x); } +__DEVICE__ +inline +float sinhf(float x) { return __ocml_sinh_f32(x); } +__DEVICE__ +inline +float sinpif(float x) { return __ocml_sinpi_f32(x); } +__DEVICE__ +inline +float sqrtf(float x) { return __ocml_sqrt_f32(x); } +__DEVICE__ +inline +float tanf(float x) { return __ocml_tan_f32(x); } +__DEVICE__ +inline +float tanhf(float x) { return __ocml_tanh_f32(x); } +__DEVICE__ +inline +float tgammaf(float x) { return __ocml_tgamma_f32(x); } +__DEVICE__ +inline +float truncf(float x) { return __ocml_trunc_f32(x); } +__DEVICE__ +inline +float y0f(float x) { return __ocml_y0_f32(x); } +__DEVICE__ +inline +float y1f(float x) { return __ocml_y1_f32(x); } +__DEVICE__ +inline +float ynf(int n, float x) +{ // TODO: we could use Ahmes multiplication and the Miller & Brown algorithm + // for linear recurrences to get O(log n) steps, but it's unclear if + // it'd be beneficial in this case. Placeholder until OCML adds + // support. 
+ if (n == 0) return y0f(x); + if (n == 1) return y1f(x); + + float x0 = y0f(x); + float x1 = y1f(x); + for (int i = 1; i < n; ++i) { + float x2 = (2 * i) / x * x1 - x0; + x0 = x1; + x1 = x2; + } + + return x1; +} + +// BEGIN INTRINSICS +__DEVICE__ +inline +float __cosf(float x) { return __ocml_native_cos_f32(x); } +__DEVICE__ +inline +float __exp10f(float x) { return __ocml_native_exp10_f32(x); } +__DEVICE__ +inline +float __expf(float x) { return __ocml_native_exp_f32(x); } +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +inline +float __fadd_rd(float x, float y) { return __ocml_add_rtn_f32(x, y); } +#endif +__DEVICE__ +inline +float __fadd_rn(float x, float y) { return x + y; } +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +inline +float __fadd_ru(float x, float y) { return __ocml_add_rtp_f32(x, y); } +__DEVICE__ +inline +float __fadd_rz(float x, float y) { return __ocml_add_rtz_f32(x, y); } +__DEVICE__ +inline +float __fdiv_rd(float x, float y) { return __ocml_div_rtn_f32(x, y); } +#endif +__DEVICE__ +inline +float __fdiv_rn(float x, float y) { return x / y; } +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +inline +float __fdiv_ru(float x, float y) { return __ocml_div_rtp_f32(x, y); } +__DEVICE__ +inline +float __fdiv_rz(float x, float y) { return __ocml_div_rtz_f32(x, y); } +#endif +__DEVICE__ +inline +float __fdividef(float x, float y) { return x / y; } +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +inline +float __fmaf_rd(float x, float y, float z) +{ + return __ocml_fma_rtn_f32(x, y, z); +} +#endif +__DEVICE__ +inline +float __fmaf_rn(float x, float y, float z) +{ + return __ocml_fma_f32(x, y, z); +} +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +inline +float __fmaf_ru(float x, float y, float z) +{ + return __ocml_fma_rtp_f32(x, y, z); +} +__DEVICE__ +inline +float __fmaf_rz(float x, float y, float z) +{ + return __ocml_fma_rtz_f32(x, y, z); +} +__DEVICE__ +inline +float __fmul_rd(float x, float y) { return __ocml_mul_rtn_f32(x, y); } +#endif +__DEVICE__ +inline +float __fmul_rn(float x, float y) { return x * y; } +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +inline +float __fmul_ru(float x, float y) { return __ocml_mul_rtp_f32(x, y); } +__DEVICE__ +inline +float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); } +__DEVICE__ +inline +float __frcp_rd(float x) { return __llvm_amdgcn_rcp_f32(x); } +#endif +__DEVICE__ +inline +float __frcp_rn(float x) { return __llvm_amdgcn_rcp_f32(x); } +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +inline +float __frcp_ru(float x) { return __llvm_amdgcn_rcp_f32(x); } +__DEVICE__ +inline +float __frcp_rz(float x) { return __llvm_amdgcn_rcp_f32(x); } +#endif +__DEVICE__ +inline +float __frsqrt_rn(float x) { return __llvm_amdgcn_rsq_f32(x); } +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +inline +float __fsqrt_rd(float x) { return __ocml_sqrt_rtn_f32(x); } +#endif +__DEVICE__ +inline +float __fsqrt_rn(float x) { return __ocml_native_sqrt_f32(x); } +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +inline +float __fsqrt_ru(float x) { return __ocml_sqrt_rtp_f32(x); } +__DEVICE__ +inline +float __fsqrt_rz(float x) { return __ocml_sqrt_rtz_f32(x); } +__DEVICE__ +inline +float __fsub_rd(float x, float y) { return __ocml_sub_rtn_f32(x, y); } +#endif +__DEVICE__ +inline +float __fsub_rn(float x, float y) { return x - y; } +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +inline +float __fsub_ru(float x, float y) { return __ocml_sub_rtp_f32(x, y); } +__DEVICE__ 
+inline +float __fsub_rz(float x, float y) { return __ocml_sub_rtz_f32(x, y); } +#endif +__DEVICE__ +inline +float __log10f(float x) { return __ocml_native_log10_f32(x); } +__DEVICE__ +inline +float __log2f(float x) { return __ocml_native_log2_f32(x); } +__DEVICE__ +inline +float __logf(float x) { return __ocml_native_log_f32(x); } +__DEVICE__ +inline +float __powf(float x, float y) { return __ocml_pow_f32(x, y); } +__DEVICE__ +inline +float __saturatef(float x) { return (x < 0) ? 0 : ((x > 1) ? 1 : x); } +__DEVICE__ +inline +void __sincosf(float x, float* sptr, float* cptr) +{ + *sptr = __ocml_native_sin_f32(x); + *cptr = __ocml_native_cos_f32(x); +} +__DEVICE__ +inline +float __sinf(float x) { return __ocml_native_sin_f32(x); } +__DEVICE__ +inline +float __tanf(float x) { return __ocml_tan_f32(x); } +// END INTRINSICS +// END FLOAT + +// BEGIN DOUBLE +__DEVICE__ +inline +double abs(double x) { return __ocml_fabs_f64(x); } +__DEVICE__ +inline +double acos(double x) { return __ocml_acos_f64(x); } +__DEVICE__ +inline +double acosh(double x) { return __ocml_acosh_f64(x); } +__DEVICE__ +inline +double asin(double x) { return __ocml_asin_f64(x); } +__DEVICE__ +inline +double asinh(double x) { return __ocml_asinh_f64(x); } +__DEVICE__ +inline +double atan(double x) { return __ocml_atan_f64(x); } +__DEVICE__ +inline +double atan2(double x, double y) { return __ocml_atan2_f64(x, y); } +__DEVICE__ +inline +double atanh(double x) { return __ocml_atanh_f64(x); } +__DEVICE__ +inline +double cbrt(double x) { return __ocml_cbrt_f64(x); } +__DEVICE__ +inline +double ceil(double x) { return __ocml_ceil_f64(x); } +__DEVICE__ +inline +double copysign(double x, double y) { return __ocml_copysign_f64(x, y); } +__DEVICE__ +inline +double cos(double x) { return __ocml_cos_f64(x); } +__DEVICE__ +inline +double cosh(double x) { return __ocml_cosh_f64(x); } +__DEVICE__ +inline +double cospi(double x) { return __ocml_cospi_f64(x); } +__DEVICE__ +inline +double cyl_bessel_i0(double x) { return __ocml_i0_f64(x); } +__DEVICE__ +inline +double cyl_bessel_i1(double x) { return __ocml_i1_f64(x); } +__DEVICE__ +inline +double erf(double x) { return __ocml_erf_f64(x); } +__DEVICE__ +inline +double erfc(double x) { return __ocml_erfc_f64(x); } +__DEVICE__ +inline +double erfcinv(double x) { return __ocml_erfcinv_f64(x); } +__DEVICE__ +inline +double erfcx(double x) { return __ocml_erfcx_f64(x); } +__DEVICE__ +inline +double erfinv(double x) { return __ocml_erfinv_f64(x); } +__DEVICE__ +inline +double exp(double x) { return __ocml_exp_f64(x); } +__DEVICE__ +inline +double exp10(double x) { return __ocml_exp10_f64(x); } +__DEVICE__ +inline +double exp2(double x) { return __ocml_exp2_f64(x); } +__DEVICE__ +inline +double expm1(double x) { return __ocml_expm1_f64(x); } +__DEVICE__ +inline +double fabs(double x) { return __ocml_fabs_f64(x); } +__DEVICE__ +inline +double fdim(double x, double y) { return __ocml_fdim_f64(x, y); } +__DEVICE__ +inline +double floor(double x) { return __ocml_floor_f64(x); } +__DEVICE__ +inline +double fma(double x, double y, double z) { return __ocml_fma_f64(x, y, z); } +__DEVICE__ +inline +double fmax(double x, double y) { return __ocml_fmax_f64(x, y); } +__DEVICE__ +inline +double fmin(double x, double y) { return __ocml_fmin_f64(x, y); } +__DEVICE__ +inline +double fmod(double x, double y) { return __ocml_fmod_f64(x, y); } +__DEVICE__ +inline +double frexp(double x, int* nptr) +{ + int tmp; + double r = + __ocml_frexp_f64(x, (__attribute__((address_space(5))) int*) &tmp); + *nptr = tmp; + + 
return r;
+}
+__DEVICE__
+inline
+double hypot(double x, double y) { return __ocml_hypot_f64(x, y); }
+__DEVICE__
+inline
+int ilogb(double x) { return __ocml_ilogb_f64(x); }
+__DEVICE__
+inline
+__RETURN_TYPE isfinite(double x) { return __ocml_isfinite_f64(x); }
+__DEVICE__
+inline
+__RETURN_TYPE isinf(double x) { return __ocml_isinf_f64(x); }
+__DEVICE__
+inline
+__RETURN_TYPE isnan(double x) { return __ocml_isnan_f64(x); }
+__DEVICE__
+inline
+double j0(double x) { return __ocml_j0_f64(x); }
+__DEVICE__
+inline
+double j1(double x) { return __ocml_j1_f64(x); }
+__DEVICE__
+inline
+double jn(int n, double x)
+{  // TODO: we could use Ahmes multiplication and the Miller & Brown algorithm
+   //       for linear recurrences to get O(log n) steps, but it's unclear if
+   //       it'd be beneficial in this case. Placeholder until OCML adds
+   //       support.
+    if (n == 0) return j0(x);
+    if (n == 1) return j1(x);
+
+    double x0 = j0(x);
+    double x1 = j1(x);
+    for (int i = 1; i < n; ++i) {
+        double x2 = (2 * i) / x * x1 - x0;
+        x0 = x1;
+        x1 = x2;
+    }
+
+    return x1;
+}
+__DEVICE__
+inline
+double ldexp(double x, int e) { return __ocml_ldexp_f64(x, e); }
+__DEVICE__
+inline
+double lgamma(double x) { return __ocml_lgamma_f64(x); }
+__DEVICE__
+inline
+long long int llrint(double x) { return __ocml_rint_f64(x); }
+__DEVICE__
+inline
+long long int llround(double x) { return __ocml_round_f64(x); }
+__DEVICE__
+inline
+double log(double x) { return __ocml_log_f64(x); }
+__DEVICE__
+inline
+double log10(double x) { return __ocml_log10_f64(x); }
+__DEVICE__
+inline
+double log1p(double x) { return __ocml_log1p_f64(x); }
+__DEVICE__
+inline
+double log2(double x) { return __ocml_log2_f64(x); }
+__DEVICE__
+inline
+double logb(double x) { return __ocml_logb_f64(x); }
+__DEVICE__
+inline
+long int lrint(double x) { return __ocml_rint_f64(x); }
+__DEVICE__
+inline
+long int lround(double x) { return __ocml_round_f64(x); }
+__DEVICE__
+inline
+double modf(double x, double* iptr)
+{
+    double tmp;
+    double r =
+        __ocml_modf_f64(x, (__attribute__((address_space(5))) double*) &tmp);
+    *iptr = tmp;
+
+    return r;
+}
+__DEVICE__
+inline
+double nan(const char* tagp)
+{
+#if !_WIN32
+    union {
+        double val;
+        struct ieee_double {
+            uint64_t mantissa : 51;
+            uint32_t quiet : 1;
+            uint32_t exponent : 11;
+            uint32_t sign : 1;
+        } bits;
+        static_assert(sizeof(double) == sizeof(ieee_double), "");
+    } tmp;
+
+    tmp.bits.sign = 0u;
+    tmp.bits.exponent = ~0u;
+    tmp.bits.quiet = 1u;
+    tmp.bits.mantissa = __make_mantissa(tagp);
+
+    return tmp.val;
+#else
+    static_assert(sizeof(uint64_t) == sizeof(double));
+    uint64_t val = __make_mantissa(tagp);
+    val |= 0xFFFull << 51;
+    return *reinterpret_cast<double*>(&val);
+#endif
+}
+__DEVICE__
+inline
+double nearbyint(double x) { return __ocml_nearbyint_f64(x); }
+__DEVICE__
+inline
+double nextafter(double x, double y) { return __ocml_nextafter_f64(x, y); }
+__DEVICE__
+inline
+double norm(int dim, const double* a)
+{  // TODO: placeholder until OCML adds support.
+ double r = 0; + while (dim--) { r += a[0] * a[0]; ++a; } + + return __ocml_sqrt_f64(r); +} +__DEVICE__ +inline +double norm3d(double x, double y, double z) +{ + return __ocml_len3_f64(x, y, z); +} +__DEVICE__ +inline +double norm4d(double x, double y, double z, double w) +{ + return __ocml_len4_f64(x, y, z, w); +} +__DEVICE__ +inline +double normcdf(double x) { return __ocml_ncdf_f64(x); } +__DEVICE__ +inline +double normcdfinv(double x) { return __ocml_ncdfinv_f64(x); } +__DEVICE__ +inline +double pow(double x, double y) { return __ocml_pow_f64(x, y); } +__DEVICE__ +inline +double powi(double base, int iexp) { return __ocml_pown_f64(base, iexp); } +__DEVICE__ +inline +double rcbrt(double x) { return __ocml_rcbrt_f64(x); } +__DEVICE__ +inline +double remainder(double x, double y) { return __ocml_remainder_f64(x, y); } +__DEVICE__ +inline +double remquo(double x, double y, int* quo) +{ + int tmp; + double r = + __ocml_remquo_f64(x, y, (__attribute__((address_space(5))) int*) &tmp); + *quo = tmp; + + return r; +} +__DEVICE__ +inline +double rhypot(double x, double y) { return __ocml_rhypot_f64(x, y); } +__DEVICE__ +inline +double rint(double x) { return __ocml_rint_f64(x); } +__DEVICE__ +inline +double rnorm(int dim, const double* a) +{ // TODO: placeholder until OCML adds support. + double r = 0; + while (dim--) { r += a[0] * a[0]; ++a; } + + return __ocml_rsqrt_f64(r); +} +__DEVICE__ +inline +double rnorm3d(double x, double y, double z) +{ + return __ocml_rlen3_f64(x, y, z); +} +__DEVICE__ +inline +double rnorm4d(double x, double y, double z, double w) +{ + return __ocml_rlen4_f64(x, y, z, w); +} +__DEVICE__ +inline +double round(double x) { return __ocml_round_f64(x); } +__DEVICE__ +inline +double rsqrt(double x) { return __ocml_rsqrt_f64(x); } +__DEVICE__ +inline +double scalbln(double x, long int n) +{ + return (n < INT_MAX) ? __ocml_scalbn_f64(x, n) : __ocml_scalb_f64(x, n); +} +__DEVICE__ +inline +double scalbn(double x, int n) { return __ocml_scalbn_f64(x, n); } +__DEVICE__ +inline +__RETURN_TYPE signbit(double x) { return __ocml_signbit_f64(x); } +__DEVICE__ +inline +double sin(double x) { return __ocml_sin_f64(x); } +__DEVICE__ +inline +void sincos(double x, double* sptr, double* cptr) +{ + double tmp; + *sptr = + __ocml_sincos_f64(x, (__attribute__((address_space(5))) double*) &tmp); + *cptr = tmp; +} +__DEVICE__ +inline +void sincospi(double x, double* sptr, double* cptr) +{ + double tmp; + *sptr = __ocml_sincospi_f64( + x, (__attribute__((address_space(5))) double*) &tmp); + *cptr = tmp; +} +__DEVICE__ +inline +double sinh(double x) { return __ocml_sinh_f64(x); } +__DEVICE__ +inline +double sinpi(double x) { return __ocml_sinpi_f64(x); } +__DEVICE__ +inline +double sqrt(double x) { return __ocml_sqrt_f64(x); } +__DEVICE__ +inline +double tan(double x) { return __ocml_tan_f64(x); } +__DEVICE__ +inline +double tanh(double x) { return __ocml_tanh_f64(x); } +__DEVICE__ +inline +double tgamma(double x) { return __ocml_tgamma_f64(x); } +__DEVICE__ +inline +double trunc(double x) { return __ocml_trunc_f64(x); } +__DEVICE__ +inline +double y0(double x) { return __ocml_y0_f64(x); } +__DEVICE__ +inline +double y1(double x) { return __ocml_y1_f64(x); } +__DEVICE__ +inline +double yn(int n, double x) +{ // TODO: we could use Ahmes multiplication and the Miller & Brown algorithm + // for linear recurrences to get O(log n) steps, but it's unclear if + // it'd be beneficial in this case. Placeholder until OCML adds + // support. 
+    if (n == 0) return y0(x);
+    if (n == 1) return y1(x);
+
+    double x0 = y0(x);
+    double x1 = y1(x);
+    for (int i = 1; i < n; ++i) {
+        double x2 = (2 * i) / x * x1 - x0;
+        x0 = x1;
+        x1 = x2;
+    }
+
+    return x1;
+}
+
+// BEGIN INTRINSICS
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dadd_rd(double x, double y) { return __ocml_add_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline
+double __dadd_rn(double x, double y) { return x + y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dadd_ru(double x, double y) { return __ocml_add_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __dadd_rz(double x, double y) { return __ocml_add_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __ddiv_rd(double x, double y) { return __ocml_div_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline
+double __ddiv_rn(double x, double y) { return x / y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __ddiv_ru(double x, double y) { return __ocml_div_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __ddiv_rz(double x, double y) { return __ocml_div_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __dmul_rd(double x, double y) { return __ocml_mul_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline
+double __dmul_rn(double x, double y) { return x * y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dmul_ru(double x, double y) { return __ocml_mul_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __drcp_rd(double x) { return __llvm_amdgcn_rcp_f64(x); }
+#endif
+__DEVICE__
+inline
+double __drcp_rn(double x) { return __llvm_amdgcn_rcp_f64(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __drcp_ru(double x) { return __llvm_amdgcn_rcp_f64(x); }
+__DEVICE__
+inline
+double __drcp_rz(double x) { return __llvm_amdgcn_rcp_f64(x); }
+__DEVICE__
+inline
+double __dsqrt_rd(double x) { return __ocml_sqrt_rtn_f64(x); }
+#endif
+__DEVICE__
+inline
+double __dsqrt_rn(double x) { return __ocml_sqrt_f64(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dsqrt_ru(double x) { return __ocml_sqrt_rtp_f64(x); }
+__DEVICE__
+inline
+double __dsqrt_rz(double x) { return __ocml_sqrt_rtz_f64(x); }
+__DEVICE__
+inline
+double __dsub_rd(double x, double y) { return __ocml_sub_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline
+double __dsub_rn(double x, double y) { return x - y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dsub_ru(double x, double y) { return __ocml_sub_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __dsub_rz(double x, double y) { return __ocml_sub_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __fma_rd(double x, double y, double z)
+{
+    return __ocml_fma_rtn_f64(x, y, z);
+}
+#endif
+__DEVICE__
+inline
+double __fma_rn(double x, double y, double z)
+{
+    return __ocml_fma_f64(x, y, z);
+}
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __fma_ru(double x, double y, double z)
+{
+    return __ocml_fma_rtp_f64(x, y, z);
+}
+__DEVICE__
+inline
+double __fma_rz(double x, double y, double z)
+{
+    return __ocml_fma_rtz_f64(x, y, z);
+}
+#endif
+// END INTRINSICS
+// END DOUBLE
+
+// BEGIN INTEGER
+__DEVICE__
+inline
+int abs(int x)
+{
+    int sgn = x >> (sizeof(int) * CHAR_BIT - 1);
+    return (x ^ sgn) - sgn;
+}
+__DEVICE__
+inline
+long labs(long x)
+{
+    long sgn = x >> (sizeof(long) * CHAR_BIT - 1);
+    return (x ^ sgn) - sgn;
+}
+__DEVICE__
+inline
+long long llabs(long long x)
+{
+    long
+
+// BEGIN INTEGER
+__DEVICE__
+inline
+int abs(int x)
+{
+    int sgn = x >> (sizeof(int) * CHAR_BIT - 1);
+    return (x ^ sgn) - sgn;
+}
+__DEVICE__
+inline
+long labs(long x)
+{
+    long sgn = x >> (sizeof(long) * CHAR_BIT - 1);
+    return (x ^ sgn) - sgn;
+}
+__DEVICE__
+inline
+long long llabs(long long x)
+{
+    long long sgn = x >> (sizeof(long long) * CHAR_BIT - 1);
+    return (x ^ sgn) - sgn;
+}
+
+#if defined(__cplusplus)
+    __DEVICE__
+    inline
+    long abs(long x) { return labs(x); }
+    __DEVICE__
+    inline
+    long long abs(long long x) { return llabs(x); }
+#endif
+// END INTEGER
+
+__DEVICE__
+inline _Float16 fma(_Float16 x, _Float16 y, _Float16 z) {
+    return __ocml_fma_f16(x, y, z);
+}
+
+__DEVICE__
+inline float fma(float x, float y, float z) {
+    return fmaf(x, y, z);
+}
+
+#pragma push_macro("__DEF_FLOAT_FUN")
+#pragma push_macro("__DEF_FLOAT_FUN2")
+#pragma push_macro("__DEF_FLOAT_FUN2I")
+#pragma push_macro("__HIP_OVERLOAD")
+#pragma push_macro("__HIP_OVERLOAD2")
+
+// __hip_enable_if::type is a type function which returns __T if __B is true.
+template <bool __B, class __T = void>
+struct __hip_enable_if {};
+
+template <class __T> struct __hip_enable_if<true, __T> {
+    typedef __T type;
+};
+
+// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to
+// avoid compilation error due to ambiguity. e.g. floor(5) is resolved with
+// floor(double).
+#define __HIP_OVERLOAD1(__retty, __fn) \
+    template <typename __T> \
+    __DEVICE__ \
+    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, \
+                             __retty>::type \
+    __fn(__T __x) { \
+        return ::__fn((double)__x); \
+    }
+
+// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double
+// or integer argument to avoid compilation error due to ambiguity. e.g.
+// max(5.0f, 6.0) is resolved with max(double, double).
+#define __HIP_OVERLOAD2(__retty, __fn) \
+    template <typename __T1, typename __T2> \
+    __DEVICE__ typename __hip_enable_if< \
+        std::numeric_limits<__T1>::is_specialized && \
+        std::numeric_limits<__T2>::is_specialized, \
+        __retty>::type \
+    __fn(__T1 __x, __T2 __y) { \
+        return __fn((double)__x, (double)__y); \
+    }
+
+// Define cmath functions with float argument and returns float.
+#define __DEF_FUN1(retty, func) \
+__DEVICE__ \
+inline \
+float func(float x) \
+{ \
+    return func##f(x); \
+} \
+__HIP_OVERLOAD1(retty, func)
+
+// Define cmath functions with float argument and returns retty.
+#define __DEF_FUNI(retty, func) \
+__DEVICE__ \
+inline \
+retty func(float x) \
+{ \
+    return func##f(x); \
+} \
+__HIP_OVERLOAD1(retty, func)
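The effect of __hip_enable_if plus __HIP_OVERLOAD1 is ordinary SFINAE-based promotion: an integral argument selects a generated template that forwards to the double overload, while exact float/double overloads still win. A standalone host sketch with stand-in names (not part of the header) shows the resolution:

    #include <cmath>
    #include <limits>

    // Stand-ins mirroring __hip_enable_if / __HIP_OVERLOAD1(double, floor).
    template <bool B, class T = void> struct enable_if_sketch {};
    template <class T> struct enable_if_sketch<true, T> { typedef T type; };

    template <typename T>
    typename enable_if_sketch<std::numeric_limits<T>::is_integer, double>::type
    my_floor(T x) { return std::floor((double)x); }      // my_floor(5) lands here
    double my_floor(double x) { return std::floor(x); }  // exact match wins
    float  my_floor(float x)  { return std::floor(x); }

    int main() {
        return (my_floor(5) == 5.0 && my_floor(5.75f) == 5.0f) ? 0 : 1;
    }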
+
+// Define cmath functions with two float arguments.
+#define __DEF_FUN2(retty, func) \
+__DEVICE__ \
+inline \
+float func(float x, float y) \
+{ \
+    return func##f(x, y); \
+} \
+__HIP_OVERLOAD2(retty, func)
+
+__DEF_FUN1(double, acos)
+__DEF_FUN1(double, acosh)
+__DEF_FUN1(double, asin)
+__DEF_FUN1(double, asinh)
+__DEF_FUN1(double, atan)
+__DEF_FUN2(double, atan2);
+__DEF_FUN1(double, atanh)
+__DEF_FUN1(double, cbrt)
+__DEF_FUN1(double, ceil)
+__DEF_FUN2(double, copysign);
+__DEF_FUN1(double, cos)
+__DEF_FUN1(double, cosh)
+__DEF_FUN1(double, erf)
+__DEF_FUN1(double, erfc)
+__DEF_FUN1(double, exp)
+__DEF_FUN1(double, exp2)
+__DEF_FUN1(double, expm1)
+__DEF_FUN1(double, fabs)
+__DEF_FUN2(double, fdim);
+__DEF_FUN1(double, floor)
+__DEF_FUN2(double, fmax);
+__DEF_FUN2(double, fmin);
+__DEF_FUN2(double, fmod);
+//__HIP_OVERLOAD1(int, fpclassify)
+__DEF_FUN2(double, hypot);
+__DEF_FUNI(int, ilogb)
+__HIP_OVERLOAD1(bool, isfinite)
+__HIP_OVERLOAD2(bool, isgreater);
+__HIP_OVERLOAD2(bool, isgreaterequal);
+__HIP_OVERLOAD1(bool, isinf);
+__HIP_OVERLOAD2(bool, isless);
+__HIP_OVERLOAD2(bool, islessequal);
+__HIP_OVERLOAD2(bool, islessgreater);
+__HIP_OVERLOAD1(bool, isnan);
+//__HIP_OVERLOAD1(bool, isnormal)
+__HIP_OVERLOAD2(bool, isunordered);
+__DEF_FUN1(double, lgamma)
+__DEF_FUN1(double, log)
+__DEF_FUN1(double, log10)
+__DEF_FUN1(double, log1p)
+__DEF_FUN1(double, log2)
+__DEF_FUN1(double, logb)
+__DEF_FUNI(long long, llrint)
+__DEF_FUNI(long long, llround)
+__DEF_FUNI(long, lrint)
+__DEF_FUNI(long, lround)
+__DEF_FUN1(double, nearbyint);
+__DEF_FUN2(double, nextafter);
+__DEF_FUN2(double, pow);
+__DEF_FUN2(double, remainder);
+__DEF_FUN1(double, rint);
+__DEF_FUN1(double, round);
+__HIP_OVERLOAD1(bool, signbit)
+__DEF_FUN1(double, sin)
+__DEF_FUN1(double, sinh)
+__DEF_FUN1(double, sqrt)
+__DEF_FUN1(double, tan)
+__DEF_FUN1(double, tanh)
+__DEF_FUN1(double, tgamma)
+__DEF_FUN1(double, trunc);
+
+// Define cmath functions with a float and an integer argument.
+#define __DEF_FLOAT_FUN2I(func) \
+__DEVICE__ \
+inline \
+float func(float x, int y) \
+{ \
+    return func##f(x, y); \
+}
+__DEF_FLOAT_FUN2I(scalbn)
+__DEF_FLOAT_FUN2I(ldexp)
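Expanded by hand, an invocation such as __DEF_FUN2(double, atan2) produces one exact float overload plus the __HIP_OVERLOAD2 template, so a mixed call like atan2(1, 2.0f) promotes both arguments to double. This is a sketch of the post-preprocessing form, not literal header text:

    __DEVICE__ inline float atan2(float x, float y) { return atan2f(x, y); }

    template <typename __T1, typename __T2>
    __DEVICE__ typename __hip_enable_if<
        std::numeric_limits<__T1>::is_specialized &&
        std::numeric_limits<__T2>::is_specialized, double>::type
    atan2(__T1 __x, __T2 __y) { return atan2((double)__x, (double)__y); }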
+
+template <typename T>
+__DEVICE__ inline T min(T arg1, T arg2) {
+    return (arg1 < arg2) ? arg1 : arg2;
+}
+
+template <typename T>
+__DEVICE__ inline T max(T arg1, T arg2) {
+    return (arg1 > arg2) ? arg1 : arg2;
+}
+
+__DEVICE__ inline int min(int arg1, int arg2) {
+    return (arg1 < arg2) ? arg1 : arg2;
+}
+__DEVICE__ inline int max(int arg1, int arg2) {
+    return (arg1 > arg2) ? arg1 : arg2;
+}
+
+__DEVICE__ inline int min(uint32_t arg1, int arg2) {
+    return (arg1 < arg2) ? arg1 : arg2;
+}
+__DEVICE__ inline int max(uint32_t arg1, int arg2) {
+    return (arg1 > arg2) ? arg1 : arg2;
+}
+
+__DEVICE__
+inline
+float max(float x, float y) {
+    return fmaxf(x, y);
+}
+
+__DEVICE__
+inline
+double max(double x, double y) {
+    return fmax(x, y);
+}
+
+__DEVICE__
+inline
+float min(float x, float y) {
+    return fminf(x, y);
+}
+
+__DEVICE__
+inline
+double min(double x, double y) {
+    return fmin(x, y);
+}
+
+__HIP_OVERLOAD2(double, max)
+__HIP_OVERLOAD2(double, min)
+
+#if !defined(__HIPCC_RTC__)
+__host__ inline static int min(int arg1, int arg2) {
+    return std::min(arg1, arg2);
+}
+
+__host__ inline static int max(int arg1, int arg2) {
+    return std::max(arg1, arg2);
+}
+#endif // !defined(__HIPCC_RTC__)
+
+__DEVICE__
+inline float pow(float base, int iexp) {
+    return powif(base, iexp);
+}
+
+__DEVICE__
+inline double pow(double base, int iexp) {
+    return powi(base, iexp);
+}
+
+__DEVICE__
+inline _Float16 pow(_Float16 base, int iexp) {
+    return __ocml_pown_f16(base, iexp);
+}
+
+#pragma pop_macro("__DEF_FLOAT_FUN")
+#pragma pop_macro("__DEF_FLOAT_FUN2")
+#pragma pop_macro("__DEF_FLOAT_FUN2I")
+#pragma pop_macro("__HIP_OVERLOAD")
+#pragma pop_macro("__HIP_OVERLOAD2")
+
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+#pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__RETURN_TYPE")
+
+// For backward compatibility.
+// There are HIP applications, e.g. TensorFlow, that expect the __HIP_ARCH_*
+// macros to be defined after including math_functions.h.
+#include
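pow(double, int) forwards to powi and hence to __ocml_pown_f64, i.e. exponentiation by an integer. Its reference semantics can be written as binary exponentiation; the following is a host-side model only, and the device library's actual algorithm and rounding behavior are not shown in this patch:

    // Reference-only model of pown(base, iexp): O(log |iexp|) multiplies.
    double pown_ref(double base, int iexp) {
        unsigned long long n =
            iexp < 0 ? -(unsigned long long)iexp : (unsigned long long)iexp;
        double result = 1.0, p = base;
        while (n) {
            if (n & 1) result *= p;   // fold in the current exponent bit
            p *= p;
            n >>= 1;
        }
        return iexp < 0 ? 1.0 / result : result;
    }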
diff --git a/include/hip/amd_detail/math_fwd.h b/include/hip/amd_detail/math_fwd.h
new file mode 100644
index 0000000000..ac46d537a8
--- /dev/null
+++ b/include/hip/amd_detail/math_fwd.h
@@ -0,0 +1,714 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "host_defines.h"
+#if defined(__cplusplus)
+    extern "C" {
+#endif
+
+// DOT FUNCTIONS
+#if __HIP_CLANG_ONLY__
+__device__
+__attribute__((const))
+int __ockl_sdot2(
+    HIP_vector_base<short, 2>::Native_vec_,
+    HIP_vector_base<short, 2>::Native_vec_,
+    int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot2(
+    HIP_vector_base<unsigned short, 2>::Native_vec_,
+    HIP_vector_base<unsigned short, 2>::Native_vec_,
+    unsigned int, bool);
+
+__device__
+__attribute__((const))
+int __ockl_sdot4(
+    HIP_vector_base<char, 4>::Native_vec_,
+    HIP_vector_base<char, 4>::Native_vec_,
+    int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot4(
+    HIP_vector_base<unsigned char, 4>::Native_vec_,
+    HIP_vector_base<unsigned char, 4>::Native_vec_,
+    unsigned int, bool);
+
+__device__
+__attribute__((const))
+int __ockl_sdot8(int, int, int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
+#endif
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// BEGIN FLOAT
+__device__
+__attribute__((const))
+float __ocml_acos_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_acosh_f32(float);
+__device__
+__attribute__((const))
+float __ocml_asin_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_asinh_f32(float);
+__device__
+__attribute__((const))
+float __ocml_atan2_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_atan_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_atanh_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_cbrt_f32(float);
+__device__
+__attribute__((const))
+float __ocml_ceil_f32(float);
+__device__
+__attribute__((const))
+float __ocml_copysign_f32(float, float);
+__device__
+float __ocml_cos_f32(float);
+__device__
+float __ocml_native_cos_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_cosh_f32(float);
+__device__
+float __ocml_cospi_f32(float);
+__device__
+float __ocml_i0_f32(float);
+__device__
+float __ocml_i1_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfc_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfcinv_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfcx_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erf_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfinv_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_exp10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp2_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_exp_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_expm1_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fabs_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fdim_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_floor_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fma_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_fmax_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_fmin_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_fmod_f32(float, float);
+__device__
+float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
+__device__
+__attribute__((const))
+float __ocml_hypot_f32(float, float);
+__device__
+__attribute__((const))
+int __ocml_ilogb_f32(float);
+__device__
+__attribute__((const))
+int __ocml_isfinite_f32(float);
+__device__
+__attribute__((const)) +int __ocml_isinf_f32(float); +__device__ +__attribute__((const)) +int __ocml_isnan_f32(float); +__device__ +float __ocml_j0_f32(float); +__device__ +float __ocml_j1_f32(float); +__device__ +__attribute__((const)) +float __ocml_ldexp_f32(float, int); +__device__ +float __ocml_lgamma_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log10_f32(float); +__device__ +__attribute__((pure)) +float __ocml_native_log10_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log1p_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log2_f32(float); +__device__ +__attribute__((pure)) +float __ocml_native_log2_f32(float); +__device__ +__attribute__((const)) +float __ocml_logb_f32(float); +__device__ +__attribute__((pure)) +float __ocml_log_f32(float); +__device__ +__attribute__((pure)) +float __ocml_native_log_f32(float); +__device__ +float __ocml_modf_f32(float, __attribute__((address_space(5))) float*); +__device__ +__attribute__((const)) +float __ocml_nearbyint_f32(float); +__device__ +__attribute__((const)) +float __ocml_nextafter_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_len3_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_len4_f32(float, float, float, float); +__device__ +__attribute__((pure)) +float __ocml_ncdf_f32(float); +__device__ +__attribute__((pure)) +float __ocml_ncdfinv_f32(float); +__device__ +__attribute__((pure)) +float __ocml_pow_f32(float, float); +__device__ +__attribute__((pure)) +float __ocml_pown_f32(float, int); +__device__ +__attribute__((pure)) +float __ocml_rcbrt_f32(float); +__device__ +__attribute__((const)) +float __ocml_remainder_f32(float, float); +__device__ +float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*); +__device__ +__attribute__((const)) +float __ocml_rhypot_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_rint_f32(float); +__device__ +__attribute__((const)) +float __ocml_rlen3_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_rlen4_f32(float, float, float, float); +__device__ +__attribute__((const)) +float __ocml_round_f32(float); +__device__ +__attribute__((pure)) +float __ocml_rsqrt_f32(float); +__device__ +__attribute__((const)) +float __ocml_scalb_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_scalbn_f32(float, int); +__device__ +__attribute__((const)) +int __ocml_signbit_f32(float); +__device__ +float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*); +__device__ +float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*); +__device__ +float __ocml_sin_f32(float); +__device__ +float __ocml_native_sin_f32(float); +__device__ +__attribute__((pure)) +float __ocml_sinh_f32(float); +__device__ +float __ocml_sinpi_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_f32(float); +__device__ +__attribute__((const)) +float __ocml_native_sqrt_f32(float); +__device__ +float __ocml_tan_f32(float); +__device__ +__attribute__((pure)) +float __ocml_tanh_f32(float); +__device__ +float __ocml_tgamma_f32(float); +__device__ +__attribute__((const)) +float __ocml_trunc_f32(float); +__device__ +float __ocml_y0_f32(float); +__device__ +float __ocml_y1_f32(float); + +// BEGIN INTRINSICS +__device__ +__attribute__((const)) +float __ocml_add_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_add_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_add_rtp_f32(float, float); +__device__ 
+__attribute__((const)) +float __ocml_add_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sub_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_mul_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rte_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rtn_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rtp_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_div_rtz_f32(float, float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rte_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rtn_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rtp_f32(float); +__device__ +__attribute__((const)) +float __ocml_sqrt_rtz_f32(float); +__device__ +__attribute__((const)) +float __ocml_fma_rte_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_fma_rtn_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_fma_rtp_f32(float, float, float); +__device__ +__attribute__((const)) +float __ocml_fma_rtz_f32(float, float, float); + +__device__ +__attribute__((const)) +float __llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32"); +__device__ +__attribute__((const)) +float __llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32"); +__device__ +__attribute__((const)) +float __llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32"); +__device__ +__attribute__((const)) +float __llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32"); +// END INTRINSICS +// END FLOAT + +// BEGIN DOUBLE +__device__ +__attribute__((const)) +double __ocml_acos_f64(double); +__device__ +__attribute__((pure)) +double __ocml_acosh_f64(double); +__device__ +__attribute__((const)) +double __ocml_asin_f64(double); +__device__ +__attribute__((pure)) +double __ocml_asinh_f64(double); +__device__ +__attribute__((const)) +double __ocml_atan2_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_atan_f64(double); +__device__ +__attribute__((pure)) +double __ocml_atanh_f64(double); +__device__ +__attribute__((pure)) +double __ocml_cbrt_f64(double); +__device__ +__attribute__((const)) +double __ocml_ceil_f64(double); +__device__ +__attribute__((const)) +double __ocml_copysign_f64(double, double); +__device__ +double __ocml_cos_f64(double); +__device__ +__attribute__((pure)) +double __ocml_cosh_f64(double); +__device__ +double __ocml_cospi_f64(double); +__device__ +double __ocml_i0_f64(double); +__device__ +double __ocml_i1_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfc_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfcinv_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfcx_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erf_f64(double); +__device__ +__attribute__((pure)) +double __ocml_erfinv_f64(double); +__device__ +__attribute__((pure)) +double __ocml_exp10_f64(double); +__device__ +__attribute__((pure)) +double __ocml_exp2_f64(double); +__device__ +__attribute__((pure)) +double 
__ocml_exp_f64(double); +__device__ +__attribute__((pure)) +double __ocml_expm1_f64(double); +__device__ +__attribute__((const)) +double __ocml_fabs_f64(double); +__device__ +__attribute__((const)) +double __ocml_fdim_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_floor_f64(double); +__device__ +__attribute__((const)) +double __ocml_fma_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fmax_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_fmin_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_fmod_f64(double, double); +__device__ +double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*); +__device__ +__attribute__((const)) +double __ocml_hypot_f64(double, double); +__device__ +__attribute__((const)) +int __ocml_ilogb_f64(double); +__device__ +__attribute__((const)) +int __ocml_isfinite_f64(double); +__device__ +__attribute__((const)) +int __ocml_isinf_f64(double); +__device__ +__attribute__((const)) +int __ocml_isnan_f64(double); +__device__ +double __ocml_j0_f64(double); +__device__ +double __ocml_j1_f64(double); +__device__ +__attribute__((const)) +double __ocml_ldexp_f64(double, int); +__device__ +double __ocml_lgamma_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log10_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log1p_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log2_f64(double); +__device__ +__attribute__((const)) +double __ocml_logb_f64(double); +__device__ +__attribute__((pure)) +double __ocml_log_f64(double); +__device__ +double __ocml_modf_f64(double, __attribute__((address_space(5))) double*); +__device__ +__attribute__((const)) +double __ocml_nearbyint_f64(double); +__device__ +__attribute__((const)) +double __ocml_nextafter_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_len3_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_len4_f64(double, double, double, double); +__device__ +__attribute__((pure)) +double __ocml_ncdf_f64(double); +__device__ +__attribute__((pure)) +double __ocml_ncdfinv_f64(double); +__device__ +__attribute__((pure)) +double __ocml_pow_f64(double, double); +__device__ +__attribute__((pure)) +double __ocml_pown_f64(double, int); +__device__ +__attribute__((pure)) +double __ocml_rcbrt_f64(double); +__device__ +__attribute__((const)) +double __ocml_remainder_f64(double, double); +__device__ +double __ocml_remquo_f64( + double, double, __attribute__((address_space(5))) int*); +__device__ +__attribute__((const)) +double __ocml_rhypot_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_rint_f64(double); +__device__ +__attribute__((const)) +double __ocml_rlen3_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_rlen4_f64(double, double, double, double); +__device__ +__attribute__((const)) +double __ocml_round_f64(double); +__device__ +__attribute__((pure)) +double __ocml_rsqrt_f64(double); +__device__ +__attribute__((const)) +double __ocml_scalb_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_scalbn_f64(double, int); +__device__ +__attribute__((const)) +int __ocml_signbit_f64(double); +__device__ +double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*); +__device__ +double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*); +__device__ +double __ocml_sin_f64(double); +__device__ +__attribute__((pure)) +double __ocml_sinh_f64(double); 
+__device__ +double __ocml_sinpi_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_f64(double); +__device__ +double __ocml_tan_f64(double); +__device__ +__attribute__((pure)) +double __ocml_tanh_f64(double); +__device__ +double __ocml_tgamma_f64(double); +__device__ +__attribute__((const)) +double __ocml_trunc_f64(double); +__device__ +double __ocml_y0_f64(double); +__device__ +double __ocml_y1_f64(double); + +// BEGIN INTRINSICS +__device__ +__attribute__((const)) +double __ocml_add_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_add_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_add_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_add_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sub_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_mul_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rte_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rtn_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rtp_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_div_rtz_f64(double, double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rte_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rtn_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rtp_f64(double); +__device__ +__attribute__((const)) +double __ocml_sqrt_rtz_f64(double); +__device__ +__attribute__((const)) +double __ocml_fma_rte_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fma_rtn_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fma_rtp_f64(double, double, double); +__device__ +__attribute__((const)) +double __ocml_fma_rtz_f64(double, double, double); + +__device__ +__attribute__((const)) +double __llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64"); +__device__ +__attribute__((const)) +double __llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64"); +// END INTRINSICS +// END DOUBLE + +#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ + +#if defined(__cplusplus) + } // extern "C" +#endif diff --git a/include/hip/amd_detail/ockl_image.h b/include/hip/amd_detail/ockl_image.h new file mode 100644 index 0000000000..b32b23fda0 --- /dev/null +++ b/include/hip/amd_detail/ockl_image.h @@ -0,0 +1,135 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +extern "C" { + +#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4))) + +__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c); + +__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT*i, int c); + +__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f); + +__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f); + +__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l); + +__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l); + +__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ 
c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p); + +__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p); + +__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c); + +__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float dx, float dy); + +__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float dx, float dy); + +__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy); + +__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy); + +__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float4::Native_vec_ dx, float4::Native_vec_ dy); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int 
ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l); + +__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c); + +}; \ No newline at end of file diff --git a/include/hip/amd_detail/program_state.hpp b/include/hip/amd_detail/program_state.hpp new file mode 100644 index 0000000000..6128a4c158 --- /dev/null +++ b/include/hip/amd_detail/program_state.hpp @@ -0,0 +1,107 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+struct ihipModuleSymbol_t;
+using hipFunction_t = ihipModuleSymbol_t*;
+
+namespace hip_impl {
+
+// This section contains internal APIs that
+// need to be exported
+#ifdef __GNUC__
+#pragma GCC visibility push (default)
+#endif
+
+struct kernarg_impl;
+class kernarg {
+public:
+    kernarg();
+    kernarg(kernarg&&);
+    ~kernarg();
+    std::uint8_t* data();
+    std::size_t size();
+    void reserve(std::size_t);
+    void resize(std::size_t);
+private:
+    kernarg_impl* impl;
+};
+
+class kernargs_size_align;
+class program_state_impl;
+class program_state {
+public:
+    program_state();
+    ~program_state();
+    program_state(const program_state&) = delete;
+
+    hipFunction_t kernel_descriptor(std::uintptr_t,
+                                    hsa_agent_t);
+
+    kernargs_size_align get_kernargs_size_align(std::uintptr_t);
+    hsa_executable_t load_executable(const char*, const size_t,
+                                     hsa_executable_t,
+                                     hsa_agent_t);
+    hsa_executable_t load_executable_no_copy(const char*, const size_t,
+                                             hsa_executable_t,
+                                             hsa_agent_t);
+
+    void* global_addr_by_name(const char* name);
+
+private:
+    friend class agent_globals_impl;
+    program_state_impl* impl;
+};
+
+class kernargs_size_align {
+public:
+    std::size_t size(std::size_t n) const;
+    std::size_t alignment(std::size_t n) const;
+    const void* getHandle() const { return handle; }
+private:
+    const void* handle;
+    friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t);
+};
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+inline
+__attribute__((visibility("hidden")))
+program_state& get_program_state() {
+    static program_state ps;
+    return ps;
+}
+} // Namespace hip_impl.
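kernarg is an opaque, growable byte buffer for kernel arguments (the backing kernarg_impl is not shown in this patch). A plausible way a caller would pack arguments into it, following the usual natural-alignment rule for kernarg segments, is sketched below with a hypothetical helper:

    #include <cstring>

    // Hypothetical helper: append one argument at its natural alignment.
    template <typename T>
    void push_arg(hip_impl::kernarg& args, const T& value) {
        std::size_t offset =
            (args.size() + alignof(T) - 1) & ~(alignof(T) - 1);  // align up
        args.resize(offset + sizeof(T));
        std::memcpy(args.data() + offset, &value, sizeof(T));    // raw byte copy
    }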
diff --git a/include/hip/amd_detail/surface_functions.h b/include/hip/amd_detail/surface_functions.h
new file mode 100644
index 0000000000..51c32bf85d
--- /dev/null
+++ b/include/hip/amd_detail/surface_functions.h
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2018 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
+
+#include
+
+#define __SURFACE_FUNCTIONS_DECL__ static inline __device__
+template <typename T>
+__SURFACE_FUNCTIONS_DECL__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y,
+                                           int boundaryMode = hipBoundaryModeZero) {
+    hipArray* arrayPtr = (hipArray*)surfObj;
+    size_t width = arrayPtr->width;
+    size_t height = arrayPtr->height;
+    int32_t xOffset = x / sizeof(T);
+    T* dataPtr = (T*)arrayPtr->data;
+    if ((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0)) {
+        if (boundaryMode == hipBoundaryModeZero) {
+            *data = 0;
+        }
+    } else {
+        *data = *(dataPtr + y * width + xOffset);
+    }
+}
+
+template <typename T>
+__SURFACE_FUNCTIONS_DECL__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y,
+                                            int boundaryMode = hipBoundaryModeZero) {
+    hipArray* arrayPtr = (hipArray*)surfObj;
+    size_t width = arrayPtr->width;
+    size_t height = arrayPtr->height;
+    int32_t xOffset = x / sizeof(T);
+    T* dataPtr = (T*)arrayPtr->data;
+    if (!((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0))) {
+        *(dataPtr + y * width + xOffset) = data;
+    }
+}
+
+#endif
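Note that surf2Dread/surf2Dwrite take the x coordinate in bytes (it is divided by sizeof(T) above), matching the CUDA convention. A minimal kernel sketch over a float surface follows; it assumes surf was created with hipCreateSurfaceObject over a 2D hipArray of float:

    __global__ void scale_inplace(hipSurfaceObject_t surf, int width, int height) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x < width && y < height) {
            float v;
            surf2Dread(&v, surf, x * (int)sizeof(float), y);   // x is a byte offset
            surf2Dwrite(2.0f * v, surf, x * (int)sizeof(float), y);
        }
    }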
diff --git a/include/hip/amd_detail/texture_fetch_functions.h b/include/hip/amd_detail/texture_fetch_functions.h
new file mode 100644
index 0000000000..399e4fecf7
--- /dev/null
+++ b/include/hip/amd_detail/texture_fetch_functions.h
@@ -0,0 +1,388 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+
+#include
+#include
+#include
+
+#if !defined(__HIPCC_RTC__)
+#include <type_traits>
+#endif // !defined(__HIPCC_RTC__)
+
+#define TEXTURE_PARAMETERS_INIT \
+    unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
+    unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
+
+template <typename T>
+struct __hip_is_tex_channel_type
+{
+    static constexpr bool value =
+        std::is_same<T, char>::value ||
+        std::is_same<T, unsigned char>::value ||
+        std::is_same<T, short>::value ||
+        std::is_same<T, unsigned short>::value ||
+        std::is_same<T, int>::value ||
+        std::is_same<T, unsigned int>::value ||
+        std::is_same<T, float>::value;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_is_tex_channel_type<HIP_vector_type<T, rank>>
+{
+    static constexpr bool value =
+        __hip_is_tex_channel_type<T>::value &&
+        ((rank == 1) ||
+         (rank == 2) ||
+         (rank == 4));
+};
+
+template <typename T>
+struct __hip_is_tex_normalized_channel_type
+{
+    static constexpr bool value =
+        std::is_same<T, char>::value ||
+        std::is_same<T, unsigned char>::value ||
+        std::is_same<T, short>::value ||
+        std::is_same<T, unsigned short>::value;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>
+{
+    static constexpr bool value =
+        __hip_is_tex_normalized_channel_type<T>::value &&
+        ((rank == 1) ||
+         (rank == 2) ||
+         (rank == 4));
+};
+
+template <
+    typename T,
+    hipTextureReadMode readMode,
+    typename Enable = void>
+struct __hip_tex_ret
+{
+    static_assert(std::is_same<T, void>::value, "Invalid channel type!");
+};
+
+template <
+    typename T,
+    hipTextureReadMode readMode>
+using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode>::type;
+
+template <typename T>
+struct __hip_tex_ret<
+    T,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_channel_type<T>::value, bool>::type>
+{
+    using type = T;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_tex_ret<
+    HIP_vector_type<T, rank>,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
+{
+    using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;
+};
+
+template <typename T>
+struct __hip_tex_ret<
+    T,
+    hipReadModeNormalizedFloat,
+    typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
+{
+    using type = float;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_tex_ret<
+    HIP_vector_type<T, rank>,
+    hipReadModeNormalizedFloat,
+    typename std::enable_if<__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
+{
+    using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;
+};
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex1Dfetch(texture<T, hipTextureType1D, readMode> t, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_load_1Db(i, x);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex1D(texture<T, hipTextureType1D, readMode> t, float x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_1D(i, s, x);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex2D(texture<T, hipTextureType2D, readMode> t, float x, float y)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex1DLayered(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
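The trait machinery resolves the fetch return type from the channel type and read mode: element reads return the channel type itself, while normalized reads map 8/16-bit channels to a float of the same rank. Illustrative compile-time checks (host-side, assuming the HIP vector types are visible; not part of the header):

    static_assert(std::is_same<__hip_tex_ret_t<int, hipReadModeElementType>, int>::value,
                  "element reads keep the channel type");
    static_assert(std::is_same<__hip_tex_ret_t<uchar4, hipReadModeNormalizedFloat>, float4>::value,
                  "normalized reads of uchar4 yield float4");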
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex2DLayered(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex3D(texture<T, hipTextureType3D, readMode> t, float x, float y, float z)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> texCubemap(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex1DLod(texture<T, hipTextureType1D, readMode> t, float x, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex2DLod(texture<T, hipTextureType2D, readMode> t, float x, float y, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex3DLod(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> texCubemapLod(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> texCubemapLayered(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float level)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> texCubemapGrad(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // TODO missing in device libs.
+    // auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    // return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+    return {};
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // TODO missing in device libs.
+    // auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    // return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+    return {};
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex1DGrad(texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex2DGrad(texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> tex3DGrad(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+}
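The *Grad variants take explicit screen-space derivatives instead of deriving them from neighboring lanes, which controls the LOD the sampler selects (larger |dPdx|/|dPdy| push it toward coarser mip levels). A usage sketch with a hypothetical global texture reference, assuming it has been bound to a 2D float texture elsewhere:

    // texRef is a stand-in name; any bound 2D float texture reference works.
    texture<float, hipTextureType2D, hipReadModeElementType> texRef;

    __global__ void sample_with_grad(float* out, float u, float v, float du, float dv) {
        *out = tex2DGrad(texRef, u, v, make_float2(du, 0.0f), make_float2(0.0f, dv));
    }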
+
+template <
+    typename T,
+    hipTextureReadMode readMode,
+    typename Enable = void>
+struct __hip_tex2dgather_ret
+{
+    static_assert(std::is_same<T, void>::value, "Invalid channel type!");
+};
+
+template <
+    typename T,
+    hipTextureReadMode readMode>
+using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode>::type;
+
+template <typename T>
+struct __hip_tex2dgather_ret<
+    T,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_channel_type<T>::value, bool>::type>
+{
+    using type = HIP_vector_type<T, 4>;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_tex2dgather_ret<
+    HIP_vector_type<T, rank>,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
+{
+    using type = HIP_vector_type<T, 4>;
+};
+
+template <typename T>
+struct __hip_tex2dgather_ret<
+    T,
+    hipReadModeNormalizedFloat,
+    typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
+{
+    using type = float4;
+};
+
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(texture<T, hipTextureType2D, readMode> t, float x, float y, int comp = 0)
+{
+    TEXTURE_PARAMETERS_INIT;
+    switch (comp) {
+    case 1: {
+        auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<__hip_tex2dgather_ret_t<T, readMode>*>(&tmp);
+    }
+    case 2: {
+        auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<__hip_tex2dgather_ret_t<T, readMode>*>(&tmp);
+    }
+    case 3: {
+        auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<__hip_tex2dgather_ret_t<T, readMode>*>(&tmp);
+    }
+    default: {
+        auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<__hip_tex2dgather_ret_t<T, readMode>*>(&tmp);
+    }
+    }
+    return {};
+}
+
+#endif
diff --git a/include/hip/amd_detail/texture_functions.h b/include/hip/amd_detail/texture_functions.h
new file mode 100644
index 0000000000..5da388ce3c
--- /dev/null
+++ b/include/hip/amd_detail/texture_functions.h
@@ -0,0 +1,11102 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/ + +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_TEXTURE_FUNCTIONS_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_TEXTURE_FUNCTIONS_H +#include +#include + +#pragma push_macro("TYPEDEF_VECTOR_VALUE_TYPE") +#define TYPEDEF_VECTOR_VALUE_TYPE(SCALAR_TYPE) \ +typedef SCALAR_TYPE __hip_##SCALAR_TYPE##2_vector_value_type __attribute__((ext_vector_type(2))); \ +typedef SCALAR_TYPE __hip_##SCALAR_TYPE##3_vector_value_type __attribute__((ext_vector_type(3))); \ +typedef SCALAR_TYPE __hip_##SCALAR_TYPE##4_vector_value_type __attribute__((ext_vector_type(4))); \ +typedef SCALAR_TYPE __hip_##SCALAR_TYPE##8_vector_value_type __attribute__((ext_vector_type(8))); \ +typedef SCALAR_TYPE __hip_##SCALAR_TYPE##16_vector_value_type __attribute__((ext_vector_type(16))); + +TYPEDEF_VECTOR_VALUE_TYPE(float); +TYPEDEF_VECTOR_VALUE_TYPE(int); +TYPEDEF_VECTOR_VALUE_TYPE(uint); + +#undef TYPEDEF_VECTOR_VALUE_TYPE +#pragma pop_macro("TYPEDEF_VECTOR_VALUE_TYPE") + +union TData { + __hip_float4_vector_value_type f; + __hip_int4_vector_value_type i; + __hip_uint4_vector_value_type u; +}; + +#define __TEXTURE_FUNCTIONS_DECL__ static inline __device__ + + +#if __clang__ +#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4))) +#else +#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(2))) +#endif + +#define TEXTURE_PARAMETERS_INIT \ + unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \ + unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; \ + TData texel; +#define TEXTURE_REF_PARAMETERS_INIT \ + unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)texRef.textureObject; \ + unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; \ + TData texel; +#define TEXTURE_SET_FLOAT *retVal = texel.f.x; + +#define TEXTURE_SET_SIGNED *retVal = texel.i.x; + +#define TEXTURE_SET_UNSIGNED *retVal = texel.u.x; + +#define TEXTURE_SET_FLOAT_X retVal->x = texel.f.x; + +#define TEXTURE_SET_SIGNED_X retVal->x = texel.i.x; + +#define TEXTURE_SET_UNSIGNED_X retVal->x = texel.u.x; + +#define TEXTURE_SET_FLOAT_XY \ + retVal->x = texel.f.x; \ + retVal->y = texel.f.y; + +#define TEXTURE_SET_SIGNED_XY \ + retVal->x = texel.i.x; \ + retVal->y = texel.i.y; + +#define TEXTURE_SET_UNSIGNED_XY \ + retVal->x = texel.u.x; \ + retVal->y = texel.u.y; + +#define TEXTURE_SET_FLOAT_XYZW \ + retVal->x = texel.f.x; \ + retVal->y = texel.f.y; \ + retVal->z = texel.f.z; \ + retVal->w = texel.f.w; + +#define TEXTURE_SET_SIGNED_XYZW \ + retVal->x = texel.i.x; \ + retVal->y = texel.i.y; \ + retVal->z = texel.i.z; \ + retVal->w = texel.i.w; + +#define TEXTURE_SET_UNSIGNED_XYZW \ + retVal->x = texel.u.x; \ + retVal->y = texel.u.y; \ + retVal->z = texel.u.z; \ + retVal->w = texel.u.w; + +#define TEXTURE_RETURN_CHAR return texel.i.x; + +#define TEXTURE_RETURN_UCHAR return texel.u.x; + +#define TEXTURE_RETURN_SHORT return texel.i.x; + +#define TEXTURE_RETURN_USHORT return texel.u.x; + +#define TEXTURE_RETURN_INT return texel.i.x; + +#define TEXTURE_RETURN_UINT return texel.u.x; + +#define TEXTURE_RETURN_SIGNED return texel.i.x; + +#define TEXTURE_RETURN_UNSIGNED return texel.u.x; + +#define TEXTURE_RETURN_CHAR_X return make_char1(texel.i.x); + +#define TEXTURE_RETURN_UCHAR_X return make_uchar1(texel.u.x); + +#define TEXTURE_RETURN_SHORT_X return make_short1(texel.i.x); + +#define TEXTURE_RETURN_USHORT_X return make_ushort1(texel.u.x); + +#define TEXTURE_RETURN_INT_X return make_int1(texel.i.x); + +#define TEXTURE_RETURN_UINT_X return make_uint1(texel.u.x); 
+ +#define TEXTURE_RETURN_CHAR_XY return make_char2(texel.i.x, texel.i.y); + +#define TEXTURE_RETURN_UCHAR_XY return make_uchar2(texel.u.x, texel.u.y); + +#define TEXTURE_RETURN_SHORT_XY return make_short2(texel.i.x, texel.i.y); + +#define TEXTURE_RETURN_USHORT_XY return make_ushort2(texel.u.x, texel.u.y); + +#define TEXTURE_RETURN_INT_XY return make_int2(texel.i.x, texel.i.y); + +#define TEXTURE_RETURN_UINT_XY return make_uint2(texel.u.x, texel.u.y); + +#define TEXTURE_RETURN_CHAR_XYZW return make_char4(texel.i.x, texel.i.y, texel.i.z, texel.i.w); + +#define TEXTURE_RETURN_UCHAR_XYZW return make_uchar4(texel.u.x, texel.u.y, texel.u.z, texel.u.w); + +#define TEXTURE_RETURN_SHORT_XYZW return make_short4(texel.i.x, texel.i.y, texel.i.z, texel.i.w); + +#define TEXTURE_RETURN_USHORT_XYZW return make_ushort4(texel.u.x, texel.u.y, texel.u.z, texel.u.w); + +#define TEXTURE_RETURN_INT_XYZW return make_int4(texel.i.x, texel.i.y, texel.i.z, texel.i.w); + +#define TEXTURE_RETURN_UINT_XYZW return make_uint4(texel.u.x, texel.u.y, texel.u.z, texel.u.w); + +#define TEXTURE_RETURN_FLOAT return texel.f.x; + +#define TEXTURE_RETURN_FLOAT_X return make_float1(texel.f.x); + +#define TEXTURE_RETURN_FLOAT_XY return make_float2(texel.f.x, texel.f.y); + +#define TEXTURE_RETURN_FLOAT_XYZW return make_float4(texel.f.x, texel.f.y, texel.f.z, texel.f.w); + +extern "C" { + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_1D( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + float c); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_1Da( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float2_vector_value_type c); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_2D( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float2_vector_value_type c); + + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_2Da( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float4_vector_value_type c); + +__device__ +float __ockl_image_sample_2Dad( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float4_vector_value_type c); + +__device__ +float __ockl_image_sample_2Dd( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float2_vector_value_type c); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_3D( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float4_vector_value_type c); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_grad_1D( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + float c, float dx, float dy); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_grad_1Da( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float2_vector_value_type c, float dx, float dy); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_grad_2D( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float2_vector_value_type c, __hip_float2_vector_value_type dx, __hip_float2_vector_value_type dy); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_grad_2Da( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float4_vector_value_type c, __hip_float2_vector_value_type dx, __hip_float2_vector_value_type dy); + 
+__device__ +float __ockl_image_sample_grad_2Dad( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float4_vector_value_type c, __hip_float2_vector_value_type dx, __hip_float2_vector_value_type dy); + +__device__ +float __ockl_image_sample_grad_2Dd( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float2_vector_value_type c, __hip_float2_vector_value_type dx, __hip_float2_vector_value_type dy); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_grad_3D( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float4_vector_value_type c, __hip_float4_vector_value_type dx, __hip_float4_vector_value_type dy); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_lod_1D( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + float c, float l); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_lod_1Da( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float2_vector_value_type c, float l); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_lod_2D( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float2_vector_value_type c, float l); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_lod_2Da( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float4_vector_value_type c, float l); + +__device__ +float __ockl_image_sample_lod_2Dad( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float4_vector_value_type c, float l); + +__device__ +float __ockl_image_sample_lod_2Dd( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float2_vector_value_type c, float l); + +__device__ +__hip_float4_vector_value_type __ockl_image_sample_lod_3D( + unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s, + __hip_float4_vector_value_type c, float l); +} + +//////////////////////////////////////////////////////////// +// Texture object APIs +//////////////////////////////////////////////////////////// + +__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(char* retVal, hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(char1* retVal, hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(char2* retVal, hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(char4* retVal, hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(unsigned char* retVal, hipTextureObject_t textureObject, + int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(uchar1* retVal, hipTextureObject_t textureObject, + int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(uchar2* retVal, hipTextureObject_t 
textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(uchar4* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(short* retVal, hipTextureObject_t textureObject, int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(short1* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(short2* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(short4* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(unsigned short* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(ushort1* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(ushort2* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(ushort4* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(int* retVal, hipTextureObject_t textureObject, int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(int1* retVal, hipTextureObject_t textureObject, int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(int2* retVal, hipTextureObject_t textureObject, int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(int4* retVal, hipTextureObject_t textureObject, int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(unsigned int* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(uint1* retVal, hipTextureObject_t textureObject, int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(uint2* retVal, hipTextureObject_t textureObject, int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(uint4* retVal, hipTextureObject_t textureObject, int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(float* retVal, hipTextureObject_t textureObject, int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(float1* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(float2* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(float4* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_FLOAT_XYZW;
+}
+
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex1Dfetch(hipTextureObject_t textureObject, int x) {
+  T ret;
+  tex1Dfetch(&ret, textureObject, x);
+  return ret;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(char* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(char1* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(char2* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(char4* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(unsigned char* retVal, hipTextureObject_t textureObject,
+                                      float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uchar1* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uchar2* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uchar4* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(short* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(short1* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(short2* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(short4* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(unsigned short* retVal, hipTextureObject_t textureObject,
+                                      float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(ushort1* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(ushort2* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(ushort4* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(int* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(int1* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(int2* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(int4* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(unsigned int* retVal, hipTextureObject_t textureObject,
+                                      float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uint1* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uint2* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uint4* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(float* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(float1* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(float2* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(float4* retVal, hipTextureObject_t textureObject, float x) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1D(i, s, x);
+  TEXTURE_SET_FLOAT_XYZW;
+}
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex1D(hipTextureObject_t textureObject, float x) {
+  T ret;
+  tex1D(&ret, textureObject, x);
+  return ret;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(char* retVal,
hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(char1* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(char2* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(char4* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(unsigned char* retVal, hipTextureObject_t textureObject, + float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uchar1* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uchar2* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uchar4* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(short* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(short1* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(short2* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(short4* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(unsigned short* retVal, hipTextureObject_t textureObject, + float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(ushort1* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(ushort2* retVal, hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(ushort4* retVal, hipTextureObject_t textureObject, float x, + float level) { + 
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(int* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(int1* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(int2* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(int4* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(unsigned int* retVal, hipTextureObject_t textureObject,
+                                         float x, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uint1* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uint2* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uint4* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(float* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(float1* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(float2* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(float4* retVal, hipTextureObject_t textureObject, float x,
+                                         float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+  TEXTURE_SET_FLOAT_XYZW;
+}
+
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex1DLod(hipTextureObject_t textureObject, float x, float level) {
+  T ret;
+  tex1DLod(&ret, textureObject, x, level);
+  return ret;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(char* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(char1* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x,
dx, dy); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(char2* retVal, hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(char4* retVal, hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(unsigned char* retVal, hipTextureObject_t textureObject, + float x, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uchar1* retVal, hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uchar2* retVal, hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uchar4* retVal, hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(short* retVal, hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(short1* retVal, hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(short2* retVal, hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(short4* retVal, hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(unsigned short* retVal, hipTextureObject_t textureObject, + float x, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(ushort1* retVal, hipTextureObject_t textureObject, + float x, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(ushort2* retVal, hipTextureObject_t textureObject, + float x, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(ushort4* retVal, hipTextureObject_t textureObject, + float x, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(int* retVal, hipTextureObject_t textureObject, float x, + float dx, 
float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(int1* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(int2* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(int4* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(unsigned int* retVal, hipTextureObject_t textureObject,
+                                          float x, float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uint1* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uint2* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uint4* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(float* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(float1* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(float2* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(float4* retVal, hipTextureObject_t textureObject, float x,
+                                          float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+  TEXTURE_SET_FLOAT_XYZW;
+}
+
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dx,
+                                       float dy) {
+  T ret;
+  tex1DGrad(&ret, textureObject, x, dx, dy);
+  return ret;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(char* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(char1* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(char2* retVal, hipTextureObject_t
textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(char4* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(unsigned char* retVal, hipTextureObject_t textureObject, + float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(uchar1* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(uchar2* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(uchar4* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(short* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(short1* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(short2* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(short4* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(unsigned short* retVal, hipTextureObject_t textureObject, + float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(ushort1* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(ushort2* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(ushort4* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(int* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2D(int1* retVal, hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = 
__ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(int2* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(int4* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(unsigned int* retVal, hipTextureObject_t textureObject,
+                                      float x, float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(uint1* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(uint2* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(uint4* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(float* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(float1* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(float2* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(float4* retVal, hipTextureObject_t textureObject, float x,
+                                      float y) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+  TEXTURE_SET_FLOAT_XYZW;
+}
+
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex2D(hipTextureObject_t textureObject, float x, float y) {
+  T ret;
+  tex2D(&ret, textureObject, x, y);
+  return ret;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(char* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(char1* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(char2* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(char4* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s,
float2(x, y).data, level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(unsigned char* retVal, hipTextureObject_t textureObject, + float x, float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uchar1* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uchar2* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uchar4* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(short* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(short1* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(short2* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(short4* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(unsigned short* retVal, hipTextureObject_t textureObject, + float x, float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(ushort1* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(ushort2* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(ushort4* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(int* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(int1* retVal, hipTextureObject_t textureObject, float x, + float y, float level) { + 
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(int2* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(int4* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(unsigned int* retVal, hipTextureObject_t textureObject,
+                                         float x, float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uint1* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uint2* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uint4* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(float* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(float1* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(float2* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(float4* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+  TEXTURE_SET_FLOAT_XYZW;
+}
+
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex2DLod(hipTextureObject_t textureObject, float x, float y,
+                                      float level) {
+  T ret;
+  tex2DLod(&ret, textureObject, x, y, level);
+  return ret;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(char* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(char1* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(char2* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(char4* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(unsigned char* retVal, hipTextureObject_t textureObject, + float x, float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(uchar1* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(uchar2* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(uchar4* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(short* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(short1* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(short2* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(short4* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(unsigned short* retVal, hipTextureObject_t textureObject, + float x, float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(ushort1* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(ushort2* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(ushort4* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3D(int* retVal, hipTextureObject_t textureObject, float x, + float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, 
float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(int1* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(int2* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(int4* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(unsigned int* retVal, hipTextureObject_t textureObject,
+                                      float x, float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(uint1* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(uint2* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(uint4* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(float* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(float1* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(float2* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(float4* retVal, hipTextureObject_t textureObject, float x,
+                                      float y, float z) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+  TEXTURE_SET_FLOAT_XYZW;
+}
+
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z) {
+  T ret;
+  tex3D(&ret, textureObject, x, y, z);
+  return ret;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(char* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float z, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+                                       level);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(char1* retVal, hipTextureObject_t textureObject, float x,
+                                         float y, float z, float level) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+                                       level);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(char2* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(char4* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(unsigned char* retVal, hipTextureObject_t textureObject, + float x, float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uchar1* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uchar2* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uchar4* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(short* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(short1* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(short2* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(short4* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(unsigned short* retVal, hipTextureObject_t textureObject, + float x, float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(ushort1* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(ushort2* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, 
z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(ushort4* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(int* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(int1* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(int2* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(int4* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(unsigned int* retVal, hipTextureObject_t textureObject, + float x, float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uint1* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uint2* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uint4* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(float* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_FLOAT; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(float1* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_FLOAT_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(float2* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_SET_FLOAT_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(float4* retVal, hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = 
__ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+                                       level);
+  TEXTURE_SET_FLOAT_XYZW;
+}
+
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z,
+                                      float level) {
+  T ret;
+  tex3DLod(&ret, textureObject, x, y, z, level);
+  return ret;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(char* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(char1* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(char2* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_SIGNED_XY;
+}
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(char4* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(unsigned char* retVal,
+                                             hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uchar1* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uchar2* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_UNSIGNED_XY;
+}
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uchar4* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(short* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(short1* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(short2* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_SIGNED_XY;
+}
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(short4* retVal, hipTextureObject_t textureObject,
+                                             float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(unsigned short* retVal,
+                                             hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(ushort1* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(ushort2* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_UNSIGNED_XY; +} +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(ushort4* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(int* retVal, hipTextureObject_t textureObject, float x, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(int1* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(int2* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_SIGNED_XY; +} +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(int4* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(unsigned int* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uint1* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uint2* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_UNSIGNED_XY; +} +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uint4* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(float* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_FLOAT; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(float1* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_FLOAT_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(float2* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_FLOAT_XY; +} +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(float4* retVal, hipTextureObject_t textureObject, + float x, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = 
__ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_SET_FLOAT_XYZW; +} + +template <class T> +__TEXTURE_FUNCTIONS_DECL__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer) { + T ret; + tex1DLayered(&ret, textureObject, x, layer); + return ret; +} + +//////////////////////////////////////////////////////////// +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(char* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(char1* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(char2* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(char4* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(unsigned char* retVal, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(uchar1* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(uchar2* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(uchar4* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(short* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(short1* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(short2* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(short4* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, 
layer).data, level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(unsigned short* retVal, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(ushort1* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(ushort2* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(ushort4* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(int* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(int1* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(int2* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(int4* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(unsigned int* retVal, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(uint1* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(uint2* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(uint4* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(float* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, 
float2(x, layer).data, level); + TEXTURE_SET_FLOAT; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(float1* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_FLOAT_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(float2* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_FLOAT_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(float4* retVal, hipTextureObject_t textureObject, + float x, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_SET_FLOAT_XYZW; +} + +template <class T> +__TEXTURE_FUNCTIONS_DECL__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, + float level) { + T ret; + tex1DLayeredLod(&ret, textureObject, x, layer, level); + return ret; +} + +//////////////////////////////////////////////////////////// +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(char* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(char1* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(char2* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(char4* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(unsigned char* retVal, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(uchar1* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(uchar2* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(uchar4* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(short* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + 
TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(short1* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(short2* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(short4* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(unsigned short* retVal, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(ushort1* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(ushort2* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(ushort4* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(int* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(int1* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(int2* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(int4* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(unsigned int* retVal, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED; +} + 
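+// Editorial note (comment added during review; the call below is a hypothetical +// illustration, not code from this patch): the Grad overloads pass explicit derivatives dx +// and dy, which the sampler presumably uses to select the level of detail, e.g. +// float4 v = tex1DLayeredGrad<float4>(texObj, x, layer, dx, dy); +// the template wrapper at the end of each family forwards to the matching pointer overload. + 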
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(uint1* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(uint2* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(uint4* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(float* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_FLOAT; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(float1* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_FLOAT_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(float2* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_FLOAT_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(float4* retVal, hipTextureObject_t textureObject, + float x, int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_SET_FLOAT_XYZW; +} + +template <class T> +__TEXTURE_FUNCTIONS_DECL__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x, int layer, + float dx, float dy) { + T ret; + tex1DLayeredGrad(&ret, textureObject, x, layer, dx, dy); + return ret; +} + +//////////////////////////////////////////////////////////// +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(char* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(char1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(char2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(char4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(unsigned char* retVal, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + 
TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(uchar1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(uchar2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(uchar4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(short* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(short1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(short2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(short4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(unsigned short* retVal, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(ushort1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(ushort2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(ushort4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(int* retVal, hipTextureObject_t textureObject, float x, + float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(int1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(int2* retVal, hipTextureObject_t textureObject, 
+ float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(int4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(unsigned int* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(uint1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(uint2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(uint4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(float* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_FLOAT; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(float1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_FLOAT_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(float2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_FLOAT_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(float4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_SET_FLOAT_XYZW; +} + +template <class T> +__TEXTURE_FUNCTIONS_DECL__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, + int layer) { + T ret; + tex2DLayered(&ret, textureObject, x, y, layer); + return ret; +} + +//////////////////////////////////////////////////////////// +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(char* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(char1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(char2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = 
__ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(char4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(unsigned char* retVal, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(uchar1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(uchar2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(uchar4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(short* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(short1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(short2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(short4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(unsigned short* retVal, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(ushort1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(ushort2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, 
float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(ushort4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(int* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(int1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(int2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(int4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_SIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(unsigned int* retVal, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(uint1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(uint2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(uint4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_UNSIGNED_XYZW; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(float* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_FLOAT; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(float1* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_FLOAT_X; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(float2* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + 
TEXTURE_SET_FLOAT_XY; +} + +__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(float4* retVal, hipTextureObject_t textureObject, + float x, float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_SET_FLOAT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + T ret; + tex2DLayeredLod(&ret, textureObject, x, y, layer, level); + return ret; +} + +//////////////////////////////////////////////////////////// +// Texture Reference APIs +//////////////////////////////////////////////////////////// +template +__TEXTURE_FUNCTIONS_DECL__ char tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1Dfetch(texture texRef, + int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1Dfetch(texture texRef, + int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT_XY; +} + +template 
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1Dfetch(texture texRef, + int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex1Dfetch(texture texRef, int x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, 
s, x); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + 
TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex1Dfetch(texture texRef, + hipTextureObject_t textureObject, int x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// +template +__TEXTURE_FUNCTIONS_DECL__ char tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1D(texture texRef, + float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex1D(texture texRef, float x) 
{ + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1D(texture texRef, + float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex1D(texture texRef, float x) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// +template +__TEXTURE_FUNCTIONS_DECL__ char tex1D(texture texRef, + hipTextureObject_t textureObject, float x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1D(texture texRef, + hipTextureObject_t textureObject, float x) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1D(i, s, x); + 
    TEXTURE_RETURN_CHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1D(texture<char2, texType, mode> texRef,
+                                       hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_CHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1D(texture<char4, texType, mode> texRef,
+                                       hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1D(texture<unsigned char, texType, mode> texRef,
+                                               hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_UCHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1D(texture<uchar1, texType, mode> texRef,
+                                        hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_UCHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1D(texture<uchar2, texType, mode> texRef,
+                                        hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_UCHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1D(texture<uchar4, texType, mode> texRef,
+                                        hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short tex1D(texture<short, texType, mode> texRef,
+                                       hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_SHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1D(texture<short1, texType, mode> texRef,
+                                        hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_SHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1D(texture<short2, texType, mode> texRef,
+                                        hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_SHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1D(texture<short4, texType, mode> texRef,
+                                        hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1D(texture<unsigned short, texType, mode> texRef,
+                                                hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_USHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1D(texture<ushort1, texType, mode> texRef,
+                                         hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_USHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1D(texture<ushort2, texType, mode> texRef,
+                                         hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_USHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1D(texture<ushort4, texType, mode> texRef,
+                                         hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int tex1D(texture<int, texType, mode> texRef,
+                                     hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1D(texture<int1, texType, mode> texRef,
+                                      hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1D(texture<int2, texType, mode> texRef,
+                                      hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1D(texture<int4, texType, mode> texRef,
+                                      hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1D(texture<unsigned int, texType, mode> texRef,
+                                              hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1D(texture<uint1, texType, mode> texRef,
+                                       hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1D(texture<uint2, texType, mode> texRef,
+                                       hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1D(texture<uint4, texType, mode> texRef,
+                                       hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float tex1D(texture<float, texType, mode> texRef,
+                                       hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_FLOAT;
+}
+//////
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float tex1D(texture<float, texType, mode> texRef, float x) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_FLOAT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1D(texture<float1, texType, mode> texRef,
+                                        hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_FLOAT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1D(texture<float2, texType, mode> texRef,
+                                        hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_FLOAT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1D(texture<float4, texType, mode> texRef,
+                                        hipTextureObject_t textureObject, float x) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_RETURN_FLOAT_XYZW;
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char tex1DLod(texture<char, texType, mode> texRef, float x,
+                                         float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_CHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLod(texture<char1, texType, mode> texRef, float x,
+                                          float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_CHAR_X;
+}
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLod(texture<char2, texType, mode> texRef, float x,
+                                          float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_CHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLod(texture<char4, texType, mode> texRef, float x,
+                                          float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLod(texture<unsigned char, texType, mode> texRef,
+                                                  float x, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_UCHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLod(texture<uchar1, texType, mode> texRef, float x,
+                                           float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_UCHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLod(texture<uchar2, texType, mode> texRef, float x,
+                                           float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_UCHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLod(texture<uchar4, texType, mode> texRef, float x,
+                                           float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLod(texture<short, texType, mode> texRef, float x,
+                                          float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_SHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLod(texture<short1, texType, mode> texRef, float x,
+                                           float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_SHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLod(texture<short2, texType, mode> texRef, float x,
+                                           float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_SHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLod(texture<short4, texType, mode> texRef, float x,
+                                           float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLod(texture<unsigned short, texType, mode> texRef,
+                                                   float x, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_USHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLod(texture<ushort1, texType, mode> texRef, float x,
+                                            float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_USHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLod(texture<ushort2, texType, mode> texRef, float x,
+                                            float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_USHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLod(texture<ushort4, texType, mode> texRef, float x,
+                                            float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLod(texture<int, texType, mode> texRef, float x, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLod(texture<int1, texType, mode> texRef, float x,
+                                         float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLod(texture<int2, texType, mode> texRef, float x,
+                                         float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLod(texture<int4, texType, mode> texRef, float x,
+                                         float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLod(texture<unsigned int, texType, mode> texRef,
+                                                 float x, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLod(texture<uint1, texType, mode> texRef, float x,
+                                          float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+    TEXTURE_RETURN_UINT_X;
+}
+
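For reference, a minimal device-side sketch of how the texture-reference tex1DLod overloads above are meant to be called (illustrative only, not taken from this patch; gTex and readLod are placeholder names, and a float 1D texture is assumed to be bound to gTex):

    texture<float, hipTextureType1D, hipReadModeElementType> gTex;

    // Each thread samples the bound 1D texture at its own texel center,
    // at the caller-supplied level of detail.
    __global__ void readLod(float* out, float level) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        out[idx] = tex1DLod(gTex, idx + 0.5f, level);
    }
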
+template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLod(texture texRef, float x, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLod(texture texRef, float x, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex1DLod(texture texRef, float x, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLod(texture texRef, float x, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLod(texture texRef, float x, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLod(texture texRef, float x, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_CHAR_X; +} +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_SHORT; +} + +template 
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + 
TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLod(texture texRef, + hipTextureObject_t textureObject, float x, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_1D(i, s, x, level); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DGrad(texture texRef, + float x, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = 
__ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DGrad(texture texRef, + float x, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DGrad(texture texRef, + float x, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex1DGrad(texture texRef, float x, float dx, + float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex1DGrad(texture texRef, float x, + float dx, float dy) { + 
TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex1DGrad(texture texRef, float x, + float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short 
tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_FLOAT_X; +} + +template 
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex1DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float dx, + float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2D(texture texRef, + float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2D(texture texRef, + float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_USHORT_X; +} + +template 
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2D(texture texRef, float x, + float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UINT_XYZW; +} + + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, 
float2(x, y).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2D(texture texRef, + hipTextureObject_t textureObject, float x, + float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2D(texture texRef, + hipTextureObject_t 
textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex2D(texture texRef, float x, float y) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex2D(texture texRef, + hipTextureObject_t textureObject, float x, float y) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLod(texture texRef, float x, float y, + float 
level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLod(texture texRef, + float x, float y, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLod(texture texRef, + float x, float y, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLod(texture texRef, float x, + float y, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLod(texture texRef, float x, + float y, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLod(texture texRef, float x, + float y, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 
tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLod(texture texRef, + float x, float y, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLod(texture texRef, float x, float y, + float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLod(texture 
texRef, + hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLod(texture texRef, + 
                                         hipTextureObject_t textureObject, float x, float y,
+                                         float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLod(texture<int2, texType, mode> texRef,
+                                         hipTextureObject_t textureObject, float x, float y,
+                                         float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLod(texture<int4, texType, mode> texRef,
+                                         hipTextureObject_t textureObject, float x, float y,
+                                         float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLod(texture<unsigned int, texType, mode> texRef,
+                                                 hipTextureObject_t textureObject, float x, float y,
+                                                 float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLod(texture<uint1, texType, mode> texRef,
+                                          hipTextureObject_t textureObject, float x, float y,
+                                          float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLod(texture<uint2, texType, mode> texRef,
+                                          hipTextureObject_t textureObject, float x, float y,
+                                          float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLod(texture<uint4, texType, mode> texRef,
+                                          hipTextureObject_t textureObject, float x, float y,
+                                          float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float tex2DLod(texture<float, texType, mode> texRef,
+                                          hipTextureObject_t textureObject, float x, float y,
+                                          float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_FLOAT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLod(texture<float1, texType, mode> texRef,
+                                           hipTextureObject_t textureObject, float x, float y,
+                                           float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_FLOAT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLod(texture<float2, texType, mode> texRef,
+                                           hipTextureObject_t textureObject, float x, float y,
+                                           float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_FLOAT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLod(texture<float4, texType, mode> texRef,
+                                           hipTextureObject_t textureObject, float x, float y,
+                                           float level) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    TEXTURE_RETURN_FLOAT_XYZW;
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char tex2DGrad(texture<char, texType, mode> texRef, float x, float y,
+                                          float2 dx, float2 dy) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,
+                                          float2(dx.x, dx.y).data,
+                                          float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_CHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DGrad(texture<char1, texType, mode> texRef, float x, float y,
+                                           float2 dx, float2 dy) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,
+                                          float2(dx.x, dx.y).data,
+                                          float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_CHAR_X;
+}
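For reference, tex2DGrad samples with caller-supplied coordinate gradients instead of an implicitly derived level of detail. A minimal sketch (illustrative only, not taken from this patch; gTex2D, sampleGrad, and width are placeholder names, and a float 2D texture is assumed):

    texture<float, hipTextureType2D, hipReadModeElementType> gTex2D;

    // Each thread samples at its pixel center; dx/dy are the per-pixel
    // derivatives of the sampling coordinates (one texel per pixel here),
    // from which the sampler chooses its filter footprint.
    __global__ void sampleGrad(float* out, int width) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        float2 dx = make_float2(1.0f, 0.0f);
        float2 dy = make_float2(0.0f, 1.0f);
        out[y * width + x] = tex2DGrad(gTex2D, x + 0.5f, y + 0.5f, dx, dy);
    }
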
+ +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DGrad(texture texRef, + float x, float y, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DGrad(texture texRef, + float x, float y, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DGrad(texture texRef, float x, + float y, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + 
TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DGrad(texture texRef, float x, + float y, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DGrad(texture texRef, float x, + float y, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DGrad(texture texRef, + float x, float y, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); 
+ TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex2DGrad(texture texRef, float x, float y, + float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, 
float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float2 dx, float2 dy) { + 
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dx.x, dx.y).data, float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DGrad(texture<unsigned int, texType, readMode> texRef, hipTextureObject_t textureObject, float x, float y, float2 dx, float2 dy) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dx.x, dx.y).data, float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DGrad(texture<uint1, texType, readMode> texRef, hipTextureObject_t textureObject, float x, float y, float2 dx, float2 dy) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dx.x, dx.y).data, float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DGrad(texture<uint2, texType, readMode> texRef, hipTextureObject_t textureObject, float x, float y, float2 dx, float2 dy) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dx.x, dx.y).data, float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DGrad(texture<uint4, texType, readMode> texRef, hipTextureObject_t textureObject, float x, float y, float2 dx, float2 dy) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dx.x, dx.y).data, float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ float tex2DGrad(texture<float, texType, readMode> texRef, hipTextureObject_t textureObject, float x, float y, float2 dx, float2 dy) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dx.x, dx.y).data, float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_FLOAT;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DGrad(texture<float1, texType, readMode> texRef, hipTextureObject_t textureObject, float x, float y, float2 dx, float2 dy) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dx.x, dx.y).data, float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_FLOAT_X;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DGrad(texture<float2, texType, readMode> texRef, hipTextureObject_t textureObject, float x, float y, float2 dx, float2 dy) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dx.x, dx.y).data, float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_FLOAT_XY;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DGrad(texture<float4, texType, readMode> texRef, hipTextureObject_t textureObject, float x, float y, float2 dx, float2 dy) {
+    TEXTURE_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dx.x, dx.y).data, float2(dy.x, dy.y).data);
+    TEXTURE_RETURN_FLOAT_XYZW;
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ char tex3D(texture<char, texType, readMode> texRef, float x, float y, float z) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+    TEXTURE_RETURN_CHAR;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ char1 tex3D(texture<char1, texType, readMode> texRef, float x, float y, float z) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+    TEXTURE_RETURN_CHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ char2 tex3D(texture<char2, texType, readMode> texRef, float x, float y, float z) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+    TEXTURE_RETURN_CHAR_XY;
+}
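+// Every tex3D reference overload maps onto __ockl_image_sample_3D with the
+// coordinate packed as float4(x, y, z, 0.0f); only the return-conversion
+// macro differs per element type. A minimal usage sketch (`volume` and
+// `probeVolume` are illustrative names, not part of this header):
+//
+//   texture<float, hipTextureType3D, hipReadModeElementType> volume;
+//
+//   __global__ void probeVolume(float* out, int nx, int ny, int nz) {
+//       int ix = blockIdx.x * blockDim.x + threadIdx.x;
+//       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+//       int iz = blockIdx.z * blockDim.z + threadIdx.z;
+//       if (ix >= nx || iy >= ny || iz >= nz) return;
+//       // With unnormalized coordinates, sample at the texel center.
+//       out[(iz * ny + iy) * nx + ix] = tex3D(volume, ix + 0.5f, iy + 0.5f, iz + 0.5f);
+//   }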
+ +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3D(texture texRef, + float x, float y, float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex3D(texture texRef, + float x, float y, float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex3D(texture texRef, float x, float y, + 
float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3D(texture texRef, float x, + float y, float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex3D(texture texRef, float x, float y, + float z) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); 
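+    // These overloads take both the legacy reference (which fixes the element
+    // type at compile time) and an explicit hipTextureObject_t, from which
+    // TEXTURE_PARAMETERS_INIT draws the image and sampler handles at run
+    // time. A host-side sketch of building such an object (field values are
+    // illustrative and error checking is omitted):
+    //
+    //   hipTextureObject_t obj = 0;
+    //   hipResourceDesc resDesc = {};
+    //   resDesc.resType = hipResourceTypeArray;
+    //   resDesc.res.array.array = volumeArray;  // a previously populated hipArray
+    //   hipTextureDesc texDesc = {};
+    //   texDesc.filterMode = hipFilterModeLinear;
+    //   texDesc.addressMode[0] = hipAddressModeClamp;
+    //   hipCreateTextureObject(&obj, &resDesc, &texDesc, nullptr);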
+ TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, 
float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex3D(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex3DLod(texture texRef, float x, float y, + float z, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex3DLod(texture texRef, float x, float y, + float z, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex3DLod(texture texRef, float x, float y, + float z, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex3DLod(texture texRef, float x, float y, + float z, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, 
+                                         level);
+    TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3DLod(texture<unsigned char, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_UCHAR;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3DLod(texture<uchar1, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_UCHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3DLod(texture<uchar2, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_UCHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3DLod(texture<uchar4, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ int tex3DLod(texture<int, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ int1 tex3DLod(texture<int1, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ int2 tex3DLod(texture<int2, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ int4 tex3DLod(texture<int4, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3DLod(texture<unsigned int, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex3DLod(texture<uint1, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex3DLod(texture<uint2, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex3DLod(texture<uint4, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ float tex3DLod(texture<float, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    TEXTURE_RETURN_FLOAT;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ float1 tex3DLod(texture<float1, texType, readMode> texRef, float x, float y, float z, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;
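+    // The trailing `level` argument is forwarded to
+    // __ockl_image_sample_lod_3D as an explicit level of detail: 0.0f reads
+    // the base image and, with a linear mip filter, fractional values blend
+    // adjacent levels. A minimal sketch (`mipVolume` and `sampleCoarse` are
+    // illustrative names, not part of this header):
+    //
+    //   texture<float4, hipTextureType3D, hipReadModeElementType> mipVolume;
+    //
+    //   __device__ float4 sampleCoarse(float x, float y, float z) {
+    //       return tex3DLod(mipVolume, x, y, z, 2.0f);  // force mip level 2
+    //   }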
texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex3DLod(texture texRef, float x, float y, + float z, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex3DLod(texture texRef, float x, float y, + float z, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, float z, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 
0.0f).data, + level); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex3DLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, + level); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + 
__ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3DGrad(texture texRef, + float x, float y, float z, float4 dx, + float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, 
dy.w).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex3DGrad(texture texRef, + float x, float y, float z, float4 dx, + float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex3DGrad(texture texRef, float x, + float y, float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex3DGrad(texture texRef, float x, + float y, float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex3DGrad(texture texRef, float x, + float y, float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3DGrad(texture texRef, + float x, float y, float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex3DGrad(texture texRef, float x, float y, + 
float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex3DGrad(texture texRef, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// +template +__TEXTURE_FUNCTIONS_DECL__ char tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned 
char tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + 
__ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UINT_XY; +} + +template 
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex3DGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + float z, float4 dx, float4 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, + float4(dx.x, dx.y, dx.z, dx.w).data, + float4(dy.x, dy.y, dy.z, dy.w).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex1DLayered(texture texRef, float x, + int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayered(texture texRef, float x, + int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayered(texture texRef, float x, + int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayered(texture texRef, float x, + int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayered(texture texRef, + float x, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayered(texture texRef, float x, + int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayered(texture texRef, float x, + int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 
+        tex1DLayered(texture<uchar4, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLayered(texture<short, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_SHORT;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayered(texture<short1, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_SHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayered(texture<short2, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_SHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayered(texture<short4, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayered(texture<unsigned short, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_USHORT;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayered(texture<ushort1, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_USHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayered(texture<ushort2, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_USHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayered(texture<ushort4, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLayered(texture<int, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayered(texture<int1, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayered(texture<int2, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayered(texture<int4, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayered(texture<unsigned int, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayered(texture<uint1, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode readMode>
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayered(texture<uint2, texType, readMode> texRef, float x, int layer) {
+    TEXTURE_REF_PARAMETERS_INIT;
+    texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
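+    // For layered sampling, __ockl_image_sample_1Da receives float2(x, layer):
+    // x addresses (and may be filtered) within one slice, while the integer
+    // layer selects the slice; no filtering happens across layers. A minimal
+    // sketch (`lutStack` and `applyLut` are illustrative names, not part of
+    // this header):
+    //
+    //   texture<float, hipTextureType1DLayered, hipReadModeElementType> lutStack;
+    //
+    //   __global__ void applyLut(float* out, const float* in, int n, int layer) {
+    //       int i = blockIdx.x * blockDim.x + threadIdx.x;
+    //       if (i < n) out[i] = tex1DLayered(lutStack, in[i], layer);
+    //   }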
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_CHAR;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_CHAR_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_CHAR_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_UCHAR;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_UCHAR_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_UCHAR_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_UCHAR_XYZW;
+}
+
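The object-taking overloads in this family sample whatever `hipTextureObject_t` the caller built on the host. A minimal host-side sketch of creating a 1D layered array and a matching texture object, using the public HIP runtime API (error checking omitted; the sizes and modes here are arbitrary examples, not values taken from this patch):

    #include <hip/hip_runtime.h>

    hipTextureObject_t makeLayered1DTex(size_t width, unsigned int layers) {
      hipChannelFormatDesc desc = hipCreateChannelDesc<float>();
      hipArray* arr = nullptr;
      // For a 1D layered array the extent is (width, 0, layers).
      hipMalloc3DArray(&arr, &desc, make_hipExtent(width, 0, layers), hipArrayLayered);

      hipResourceDesc resDesc = {};
      resDesc.resType = hipResourceTypeArray;
      resDesc.res.array.array = arr;

      hipTextureDesc texDesc = {};
      texDesc.addressMode[0] = hipAddressModeClamp;
      texDesc.filterMode = hipFilterModePoint;
      texDesc.readMode = hipReadModeElementType;
      texDesc.normalizedCoords = 0;

      hipTextureObject_t tex = 0;
      hipCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);
      return tex;
    }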
+template
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_SHORT;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_SHORT_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_SHORT_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_USHORT;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_USHORT_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_USHORT_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_INT;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_INT_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_INT_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_INT_XYZW;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_UINT;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+  TEXTURE_RETURN_UINT_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f = __ockl_image_sample_1Da(i, s,
float2(x, layer).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayered(texture texRef, + hipTextureObject_t textureObject, float x, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex1DLayered(texture texRef, + hipTextureObject_t textureObject, float x, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayered(texture texRef, + hipTextureObject_t textureObject, float x, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayered(texture texRef, + hipTextureObject_t textureObject, float x, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayered(texture texRef, + hipTextureObject_t textureObject, float x, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayeredLod( + texture texRef, float x, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex1DLayeredLod(texture texRef, float x, + int layer, 
float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayeredLod( + texture texRef, float x, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayeredLod(texture texRef, + float x, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayeredLod(texture texRef, 
float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayeredLod(texture texRef, float x, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayeredLod( + texture texRef, hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UCHAR_XY; +} + +template 
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayeredLod( + texture texRef, hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, int layer, + float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, 
float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + 
__ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayeredGrad( + texture texRef, float x, int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayeredGrad( + texture texRef, hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short 
tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayeredGrad( + texture texRef, float x, int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayeredGrad( + texture texRef, hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, 
float2(x, layer).data, dx, dy); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayeredGrad(texture texRef, float x, + int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + int layer, float dx, float dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayeredGrad( + texture texRef, float x, int layer, float dx, float dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayeredGrad( + texture texRef, hipTextureObject_t textureObject, float x, 
+    int layer, float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_UINT;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayeredGrad(texture texRef, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_UINT_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayeredGrad(texture texRef,
+    hipTextureObject_t textureObject, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_UINT_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayeredGrad(texture texRef, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_UINT_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayeredGrad(texture texRef,
+    hipTextureObject_t textureObject, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_UINT_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayeredGrad(texture texRef, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_UINT_XYZW;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayeredGrad(texture texRef,
+    hipTextureObject_t textureObject, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_UINT_XYZW;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ float tex1DLayeredGrad(texture texRef, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_FLOAT;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ float tex1DLayeredGrad(texture texRef,
+    hipTextureObject_t textureObject, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_FLOAT;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayeredGrad(texture texRef, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_FLOAT_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayeredGrad(texture texRef,
+    hipTextureObject_t textureObject, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_FLOAT_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayeredGrad(texture texRef, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_FLOAT_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayeredGrad(texture texRef,
+    hipTextureObject_t textureObject, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_FLOAT_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayeredGrad(texture texRef, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_FLOAT_XYZW;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayeredGrad(texture texRef,
+    hipTextureObject_t textureObject, float x,
+    int layer, float dx, float dy) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy);
+  TEXTURE_RETURN_FLOAT_XYZW;
+}
+
+////////////////////////////////////////////////////////////
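The `Grad` overloads above mirror the `Lod` family: instead of an explicit mip level, the caller passes the coordinate derivatives `dx`/`dy` and the sampler derives the level of detail from them. A small kernel sketch contrasting the two, using the reference-based overloads from this hunk; the texture reference's template arguments were stripped from this patch in extraction, so the declaration below is an assumed reconstruction:

    // Texture reference with assumed template arguments:
    texture<float, hipTextureType1DLayered, hipReadModeElementType> layeredRef;

    __global__ void sampleBoth(float* out, float x, int layer) {
      // Explicit selection: fetch directly from mip level 2.
      out[0] = tex1DLayeredLod(layeredRef, x, layer, 2.0f);
      // Derivative-driven selection: dx/dy of one small step per pixel
      // let the sampler pick the level instead.
      out[1] = tex1DLayeredGrad(layeredRef, x, layer, 0.01f, 0.0f);
    }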
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char tex2DLayered(texture texRef, float x, float y,
+    int layer) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_CHAR;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char tex2DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, float y,
+    int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_CHAR;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayered(texture texRef, float x, float y,
+    int layer) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_CHAR_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, float y,
+    int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_CHAR_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayered(texture texRef, float x, float y,
+    int layer) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_CHAR_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, float y,
+    int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_CHAR_XY;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayered(texture texRef, float x, float y,
+    int layer) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x, float y,
+    int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayered(texture texRef,
+    float x, float y, int layer) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_UCHAR;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayered(texture texRef,
+    hipTextureObject_t textureObject, float x,
+    float y, int layer) {
+  TEXTURE_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_UCHAR;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayered(texture texRef, float x,
+    float y, int layer) {
+  TEXTURE_REF_PARAMETERS_INIT;
+  texel.f =
+      __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+  TEXTURE_RETURN_UCHAR_X;
+}
+
+template
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayered(texture
texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayered( + texture texRef, float x, float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, 
s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayered( + texture texRef, hipTextureObject_t textureObject, float x, + float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2DLayered(texture texRef, float x, float y, + int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayered(texture texRef, float x, float y, + int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayered(texture texRef, float x, float y, + int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayered(texture 
texRef, float x, float y, + int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayered(texture texRef, + float x, float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 
0.0f).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayered(texture texRef, float x, + float y, int layer) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayered(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayeredLod( + texture 
texRef, float x, float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayeredLod( + texture texRef, hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + 
TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayeredLod( + texture texRef, float x, float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayeredLod( + texture texRef, hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2DLayeredLod(texture texRef, float x, float y, + int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + 
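+  // The layered-LOD sampler packs the coordinates as (x, y, layer, 0): the
+  // layer index rides in the third component, and `level` selects the mip
+  // level explicitly instead of deriving it from gradients.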
texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayeredLod(texture texRef, + float x, float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + 
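+  // TEXTURE_RETURN_UINT_XY is assumed to convert the raw texel fetched above
+  // into the two-component unsigned integer result of this overload.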
TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayeredLod(texture texRef, float x, + float y, int layer, float level) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_FLOAT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayeredLod(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float level) { + TEXTURE_PARAMETERS_INIT; + texel.f = __ockl_image_sample_lod_2Da( + i, s, float4(x, y, layer, 0.0f).data, level); + TEXTURE_RETURN_FLOAT_XYZW; +} + +//////////////////////////////////////////////////////////// + +template +__TEXTURE_FUNCTIONS_DECL__ char tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + 
float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_CHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayeredGrad( + texture texRef, float x, float y, int layer, float2 dx, + float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayeredGrad( + texture texRef, hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_X; +} 
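+
+// Illustrative device-side usage (variable names here are hypothetical): with
+// a texture reference bound to a 2D layered array, an explicit-gradient fetch
+// looks like
+//
+//   texture<uchar1, hipTextureType2DLayered, hipReadModeElementType> texRef;
+//   ...
+//   uchar1 v = tex2DLayeredGrad(texRef, x, y, layer, dx, dy);
+//
+// where the float2 values dx and dy approximate the derivatives of (x, y)
+// across the sampling domain.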
+ +template +__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UCHAR_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayeredGrad(texture texRef, + 
hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_SHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayeredGrad( + texture texRef, float x, float y, int layer, float2 dx, + float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayeredGrad( + texture texRef, hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, 
+ float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_USHORT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, float y, + int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_INT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayeredGrad( + texture texRef, float x, float y, int layer, float2 dx, + float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayeredGrad( + texture texRef, hipTextureObject_t textureObject, float x, float y, + int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + 
__ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_UINT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + 
float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT_X; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT_XY; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayeredGrad(texture texRef, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_REF_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT_XYZW; +} + +template +__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayeredGrad(texture texRef, + hipTextureObject_t textureObject, float x, + float y, int layer, float2 dx, float2 dy) { + TEXTURE_PARAMETERS_INIT; + texel.f = + __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, + float2(dx.x, dx.y).data, + float2(dy.x, dy.y).data); + TEXTURE_RETURN_FLOAT_XYZW; +} +#endif diff --git a/include/hip/amd_detail/texture_indirect_functions.h b/include/hip/amd_detail/texture_indirect_functions.h new file mode 100644 index 0000000000..87279da8c0 --- /dev/null +++ b/include/hip/amd_detail/texture_indirect_functions.h @@ -0,0 +1,503 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+
+#include
+#include
+#include
+
+#if !defined(__HIPCC_RTC__)
+#include <type_traits>
+#endif // !defined(__HIPCC_RTC__)
+
+#define TEXTURE_OBJECT_PARAMETERS_INIT \
+    unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
+    unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
+
+template <typename T>
+struct __hip_is_itex_channel_type
+{
+    static constexpr bool value =
+        std::is_same<T, char>::value ||
+        std::is_same<T, unsigned char>::value ||
+        std::is_same<T, short>::value ||
+        std::is_same<T, unsigned short>::value ||
+        std::is_same<T, int>::value ||
+        std::is_same<T, unsigned int>::value ||
+        std::is_same<T, float>::value;
+};
+
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_is_itex_channel_type<HIP_vector_type<T, rank>>
+{
+    static constexpr bool value =
+        __hip_is_itex_channel_type<T>::value &&
+        ((rank == 1) ||
+         (rank == 2) ||
+         (rank == 4));
+};
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1Dfetch(hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_load_1Db(i, x);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1Dfetch(T *ptr, hipTextureObject_t textureObject, int x)
+{
+    *ptr = tex1Dfetch<T>(textureObject, x);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1D(hipTextureObject_t textureObject, float x)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_1D(i, s, x);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1D(T *ptr, hipTextureObject_t textureObject, float x)
+{
+    *ptr = tex1D<T>(textureObject, x);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2D(hipTextureObject_t textureObject, float x, float y)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2D(T *ptr, hipTextureObject_t textureObject, float x, float y)
+{
+    *ptr = tex2D<T>(textureObject, x, y);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex3D(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
+{
+    *ptr = tex3D<T>(textureObject, x, y, z);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    return *reinterpret_cast<T*>(&tmp);
+}
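+
+// Illustrative usage (hypothetical kernel, not part of this header): the
+// object-based overloads above are called directly on a hipTextureObject_t
+// created with hipCreateTextureObject, e.g.
+//
+//   __global__ void fetchLayer(hipTextureObject_t tex, float* out,
+//                              float x, int layer) {
+//     out[0] = tex1DLayered<float>(tex, x, layer);
+//   }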
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1DLayered(T *ptr, hipTextureObject_t textureObject, float x, int layer)
+{
+    *ptr = tex1DLayered<T>(textureObject, x, layer);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2DLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer)
+{
+    *ptr = tex2DLayered<T>(textureObject, x, y, layer);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T texCubemap(hipTextureObject_t textureObject, float x, float y, float z)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void texCubemap(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
+{
+    *ptr = texCubemap<T>(textureObject, x, y, z);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void texCubemapLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer)
+{
+    *ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    // comp selects the gathered component, following the CUDA convention:
+    // 0 -> r/x, 1 -> g/y, 2 -> b/z, 3 -> a/w.
+    switch (comp) {
+    case 1: {
+        auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<T*>(&tmp);
+    }
+    case 2: {
+        auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<T*>(&tmp);
+    }
+    case 3: {
+        auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<T*>(&tmp);
+    }
+    default: {
+        auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<T*>(&tmp);
+    }
+    }
+    return {};
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2Dgather(T *ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0)
+{
+    *ptr = tex2Dgather<T>(textureObject, x, y, comp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1DLod(hipTextureObject_t textureObject, float x, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1DLod(T *ptr, hipTextureObject_t textureObject, float x, float level)
+{
+    *ptr = tex1DLod<T>(textureObject, x, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float level)
+{
+    *ptr = tex2DLod<T>(textureObject, x, y, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex3DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    *ptr = tex3DLod<T>(textureObject, x, y, z, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, int layer, float level)
+{
+    *ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level)
+{
+    *ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
+    return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void texCubemapLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    *ptr = texCubemapLod<T>(textureObject, x, y, z, level);
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
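+    // No device-libs intrinsic is wired up for cubemap gradient sampling yet
+    // (see the TODO below), so this overload currently returns a
+    // zero-initialized value.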
TEXTURE_OBJECT_PARAMETERS_INIT + // TODO missing in device libs. + // auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data); + // return *reinterpret_cast(&tmp); + return {}; +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ void texCubemapGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy) +{ + *ptr = texCubemapGrad(textureObject, x, y, z, dPdx, dPdy); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x, float y, float z, int layer, float level) +{ + TEXTURE_OBJECT_PARAMETERS_INIT + auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level); + return *reinterpret_cast(&tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ void texCubemapLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float level) +{ + *ptr = texCubemapLayeredLod(textureObject, x, y, z, layer, level); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx, float dPdy) +{ + TEXTURE_OBJECT_PARAMETERS_INIT + auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy); + return *reinterpret_cast(&tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ void tex1DGrad(T *ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy) +{ + *ptr = tex1DGrad(textureObject, x, dPdx, dPdy); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy) +{ + TEXTURE_OBJECT_PARAMETERS_INIT + auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data); + return *reinterpret_cast(&tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ void tex2DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy) +{ + *ptr = tex2DGrad(textureObject, x, y, dPdx, dPdy); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy) +{ + TEXTURE_OBJECT_PARAMETERS_INIT + auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data); + return *reinterpret_cast(&tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ void tex3DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy) +{ + *ptr = tex3DGrad(textureObject, x, y, z, dPdx, dPdy); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ T tex1DLayeredGrad(hipTextureObject_t textureObject, 
float x, int layer, float dPdx, float dPdy) +{ + TEXTURE_OBJECT_PARAMETERS_INIT + auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy); + return *reinterpret_cast(&tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ void tex1DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy) +{ + *ptr = tex1DLayeredGrad(textureObject, x, layer, dPdx, dPdy); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy) +{ + TEXTURE_OBJECT_PARAMETERS_INIT + auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data); + return *reinterpret_cast(&tmp); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ void tex2DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy) +{ + *ptr = tex2DLayeredGrad(textureObject, x, y, layer, dPdx, dPdy); +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) +{ + TEXTURE_OBJECT_PARAMETERS_INIT + // TODO missing in device libs. + // auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data); + // return *reinterpret_cast(&tmp); + return {}; +} + +template < + typename T, + typename std::enable_if<__hip_is_itex_channel_type::value>::type* = nullptr> +static __device__ void texCubemapLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) +{ + *ptr = texCubemapLayeredGrad(textureObject, x, y, z, layer, dPdx, dPdy); +} + +#endif diff --git a/include/hip/amd_detail/texture_types.h b/include/hip/amd_detail/texture_types.h new file mode 100644 index 0000000000..3cfb7dac9f --- /dev/null +++ b/include/hip/amd_detail/texture_types.h @@ -0,0 +1,109 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_TEXTURE_TYPES_H
+
+#include
+
+#define hipTextureType1D 0x01
+#define hipTextureType2D 0x02
+#define hipTextureType3D 0x03
+#define hipTextureTypeCubemap 0x0C
+#define hipTextureType1DLayered 0xF1
+#define hipTextureType2DLayered 0xF2
+#define hipTextureTypeCubemapLayered 0xFC
+
+/**
+ * Should be the same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD
+ */
+#define HIP_IMAGE_OBJECT_SIZE_DWORD 12
+#define HIP_SAMPLER_OBJECT_SIZE_DWORD 8
+#define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD
+#define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD)
+
+/**
+ * An opaque value that represents a HIP texture object
+ */
+struct __hip_texture;
+typedef struct __hip_texture* hipTextureObject_t;
+
+/**
+ * HIP texture address modes
+ */
+enum hipTextureAddressMode {
+    hipAddressModeWrap = 0,
+    hipAddressModeClamp = 1,
+    hipAddressModeMirror = 2,
+    hipAddressModeBorder = 3
+};
+
+/**
+ * HIP texture filter modes
+ */
+enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 };
+
+/**
+ * HIP texture read modes
+ */
+enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 };
+
+/**
+ * HIP texture reference
+ */
+typedef struct textureReference {
+    int normalized;
+    enum hipTextureReadMode readMode;          // used only for the driver APIs
+    enum hipTextureFilterMode filterMode;
+    enum hipTextureAddressMode addressMode[3]; // texture address mode for up to 3 dimensions
+    struct hipChannelFormatDesc channelDesc;
+    int sRGB;                                  // perform sRGB->linear conversion during texture read
+    unsigned int maxAnisotropy;                // limit to the anisotropy ratio
+    enum hipTextureFilterMode mipmapFilterMode;
+    float mipmapLevelBias;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+
+    hipTextureObject_t textureObject;
+    int numChannels;
+    enum hipArray_Format format;
+} textureReference;
+
+/**
+ * HIP texture descriptor
+ */
+typedef struct hipTextureDesc {
+    enum hipTextureAddressMode addressMode[3]; // texture address mode for up to 3 dimensions
+    enum hipTextureFilterMode filterMode;
+    enum hipTextureReadMode readMode;
+    int sRGB;                                  // perform sRGB->linear conversion during texture read
+    float borderColor[4];
+    int normalizedCoords;
+    unsigned int maxAnisotropy;
+    enum hipTextureFilterMode mipmapFilterMode;
+    float mipmapLevelBias;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+} hipTextureDesc;
+
+#endif
diff --git a/include/hip/channel_descriptor.h b/include/hip/channel_descriptor.h
index e012bce469..47d842fd3e 100644
--- a/include/hip/channel_descriptor.h
+++ b/include/hip/channel_descriptor.h
@@ -29,9 +29,9 @@ THE SOFTWARE.
 #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__))
-#include
+#include
 #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__))
-#include
+#include
 #else
 #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
 #endif
diff --git a/include/hip/device_functions.h b/include/hip/device_functions.h
index 96bbb05036..585d986c7d 100644
--- a/include/hip/device_functions.h
+++ b/include/hip/device_functions.h
@@ -26,7 +26,7 @@ THE SOFTWARE.
#include #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) #include #else diff --git a/include/hip/driver_types.h b/include/hip/driver_types.h index 18313d70de..1a8aa0daa8 100644 --- a/include/hip/driver_types.h +++ b/include/hip/driver_types.h @@ -26,7 +26,7 @@ THE SOFTWARE. #include #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) #include "driver_types.h" #else diff --git a/include/hip/hcc_detail b/include/hip/hcc_detail new file mode 120000 index 0000000000..4931d48978 --- /dev/null +++ b/include/hip/hcc_detail @@ -0,0 +1 @@ +amd_detail \ No newline at end of file diff --git a/include/hip/hip_complex.h b/include/hip/hip_complex.h index c58e8c77c8..89943a6cc0 100644 --- a/include/hip/hip_complex.h +++ b/include/hip/hip_complex.h @@ -26,9 +26,9 @@ THE SOFTWARE. #include #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif diff --git a/include/hip/hip_cooperative_groups.h b/include/hip/hip_cooperative_groups.h index afc1fe799d..cff88d4217 100644 --- a/include/hip/hip_cooperative_groups.h +++ b/include/hip/hip_cooperative_groups.h @@ -35,10 +35,10 @@ THE SOFTWARE. #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) #if __cplusplus && defined(__clang__) && defined(__HIP__) -#include +#include #endif #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif diff --git a/include/hip/hip_fp16.h b/include/hip/hip_fp16.h index 332be4d263..626ce64c7d 100644 --- a/include/hip/hip_fp16.h +++ b/include/hip/hip_fp16.h @@ -26,7 +26,7 @@ THE SOFTWARE. #include #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) #include "cuda_fp16.h" #else diff --git a/include/hip/hip_runtime.h b/include/hip/hip_runtime.h index 00fe21daf7..73dd87226f 100644 --- a/include/hip/hip_runtime.h +++ b/include/hip/hip_runtime.h @@ -59,9 +59,9 @@ THE SOFTWARE. 
#include #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index 30f12bc329..7cd26bc1e4 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -456,3896 +456,9 @@ enum hipComputeMode { */ #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) - -#include -#include -#ifndef GENERIC_GRID_LAUNCH -#define GENERIC_GRID_LAUNCH 1 -#endif -#include -#include -#include -#include -#if defined(_MSC_VER) -#define DEPRECATED(msg) __declspec(deprecated(msg)) -#else // !defined(_MSC_VER) -#define DEPRECATED(msg) __attribute__ ((deprecated(msg))) -#endif // !defined(_MSC_VER) -#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases. For more details please refer https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_deprecated_api_list.md" -#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) -#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) -#define HIP_LAUNCH_PARAM_END ((void*)0x03) -#ifdef __cplusplus - #define __dparm(x) \ - = x -#else - #define __dparm(x) -#endif -#ifdef __GNUC__ -#pragma GCC visibility push (default) -#endif -#ifdef __cplusplus -namespace hip_impl { -hipError_t hip_init(); -} // namespace hip_impl -#endif -// Structure definitions: -#ifdef __cplusplus -extern "C" { -#endif -//--- -// API-visible structures -typedef struct ihipCtx_t* hipCtx_t; -// Note many APIs also use integer deviceIds as an alternative to the device pointer: -typedef int hipDevice_t; -typedef enum hipDeviceP2PAttr { - hipDevP2PAttrPerformanceRank = 0, - hipDevP2PAttrAccessSupported, - hipDevP2PAttrNativeAtomicSupported, - hipDevP2PAttrHipArrayAccessSupported -} hipDeviceP2PAttr; -typedef struct ihipStream_t* hipStream_t; -#define hipIpcMemLazyEnablePeerAccess 0 -#define HIP_IPC_HANDLE_SIZE 64 -typedef struct hipIpcMemHandle_st { - char reserved[HIP_IPC_HANDLE_SIZE]; -} hipIpcMemHandle_t; -typedef struct hipIpcEventHandle_st { - char reserved[HIP_IPC_HANDLE_SIZE]; -} hipIpcEventHandle_t; -typedef struct ihipModule_t* hipModule_t; -typedef struct ihipModuleSymbol_t* hipFunction_t; -typedef struct hipFuncAttributes { - int binaryVersion; - int cacheModeCA; - size_t constSizeBytes; - size_t localSizeBytes; - int maxDynamicSharedSizeBytes; - int maxThreadsPerBlock; - int numRegs; - int preferredShmemCarveout; - int ptxVersion; - size_t sharedSizeBytes; -} hipFuncAttributes; -typedef struct ihipEvent_t* hipEvent_t; -enum hipLimit_t { - hipLimitMallocHeapSize = 0x02, -}; -/** - * @addtogroup GlobalDefs More - * @{ - */ -//! Flags that can be used with hipStreamCreateWithFlags -#define hipStreamDefault \ - 0x00 ///< Default stream creation flags. These are used with hipStreamCreate(). -#define hipStreamNonBlocking 0x01 ///< Stream does not implicitly synchronize with null stream -//! Flags that can be used with hipEventCreateWithFlags: -#define hipEventDefault 0x0 ///< Default flags -#define hipEventBlockingSync \ - 0x1 ///< Waiting will yield CPU. 
Power-friendly and usage-friendly but may increase latency. -#define hipEventDisableTiming \ - 0x2 ///< Disable event's capability to record timing information. May improve performance. -#define hipEventInterprocess 0x4 ///< Event can support IPC. @warning - not supported in HIP. -#define hipEventReleaseToDevice \ - 0x40000000 /// < Use a device-scope release when recording this event. This flag is useful to - /// obtain more precise timings of commands between events. The flag is a no-op on - /// CUDA platforms. -#define hipEventReleaseToSystem \ - 0x80000000 /// < Use a system-scope release when recording this event. This flag is - /// useful to make non-coherent host memory visible to the host. The flag is a - /// no-op on CUDA platforms. -//! Flags that can be used with hipHostMalloc -#define hipHostMallocDefault 0x0 -#define hipHostMallocPortable 0x1 ///< Memory is considered allocated by all contexts. -#define hipHostMallocMapped \ - 0x2 ///< Map the allocation into the address space for the current device. The device pointer - ///< can be obtained with #hipHostGetDevicePointer. -#define hipHostMallocWriteCombined 0x4 -#define hipHostMallocNumaUser \ - 0x20000000 ///< Host memory allocation will follow numa policy set by user -#define hipHostMallocCoherent \ - 0x40000000 ///< Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific - ///< allocation. -#define hipHostMallocNonCoherent \ - 0x80000000 ///< Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific - ///< allocation. -#define hipMemAttachGlobal 0x01 ///< Memory can be accessed by any stream on any device -#define hipMemAttachHost 0x02 ///< Memory cannot be accessed by any stream on any device -#define hipMemAttachSingle 0x04 ///< Memory can only be accessed by a single stream on - ///< the associated device -#define hipDeviceMallocDefault 0x0 -#define hipDeviceMallocFinegrained 0x1 ///< Memory is allocated in fine grained region of device. -#define hipMallocSignalMemory 0x2 ///< Memory represents a HSA signal. -//! Flags that can be used with hipHostRegister -#define hipHostRegisterDefault 0x0 ///< Memory is Mapped and Portable -#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts. -#define hipHostRegisterMapped \ - 0x2 ///< Map the allocation into the address space for the current device. The device pointer - ///< can be obtained with #hipHostGetDevicePointer. -#define hipHostRegisterIoMemory 0x4 ///< Not supported. -#define hipExtHostRegisterCoarseGrained 0x8 ///< Coarse Grained host memory lock -#define hipDeviceScheduleAuto 0x0 ///< Automatically select between Spin and Yield -#define hipDeviceScheduleSpin \ - 0x1 ///< Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and - ///< may consume more power. -#define hipDeviceScheduleYield \ - 0x2 ///< Yield the CPU to the operating system when waiting. May increase latency, but lowers - ///< power and is friendlier to other threads in the system. 
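The allocation flags above compose in the usual bitwise-OR fashion. A minimal sketch (not part of this header; error handling mostly elided) of pinning host memory with hipHostMalloc and recovering the device-side alias that the hipHostMallocMapped comment refers to via hipHostGetDevicePointer:

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    float* hostPtr = nullptr;
    // Pinned, device-visible host allocation using the flags documented above.
    if (hipHostMalloc(reinterpret_cast<void**>(&hostPtr), 1024 * sizeof(float),
                      hipHostMallocMapped | hipHostMallocPortable) != hipSuccess) {
        return 1;
    }
    float* devPtr = nullptr;
    // Device-side alias of the pinned allocation; the flags argument must be 0.
    hipHostGetDevicePointer(reinterpret_cast<void**>(&devPtr), hostPtr, 0);
    std::printf("host %p maps to device %p\n", (void*)hostPtr, (void*)devPtr);
    hipHostFree(hostPtr);
    return 0;
}
```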
-#define hipDeviceScheduleBlockingSync 0x4 -#define hipDeviceScheduleMask 0x7 -#define hipDeviceMapHost 0x8 -#define hipDeviceLmemResizeToMax 0x16 -#define hipArrayDefault 0x00 ///< Default HIP array allocation flag -#define hipArrayLayered 0x01 -#define hipArraySurfaceLoadStore 0x02 -#define hipArrayCubemap 0x04 -#define hipArrayTextureGather 0x08 -#define hipOccupancyDefault 0x00 -#define hipCooperativeLaunchMultiDeviceNoPreSync 0x01 -#define hipCooperativeLaunchMultiDeviceNoPostSync 0x02 -#define hipCpuDeviceId ((int)-1) -#define hipInvalidDeviceId ((int)-2) -// Flags that can be used with hipExtLaunch Set of APIs -#define hipExtAnyOrderLaunch 0x01 ///< AnyOrderLaunch of kernels -// Flags to be used with hipStreamWaitValue32 and hipStreamWaitValue64 -#define hipStreamWaitValueGte 0x0 -#define hipStreamWaitValueEq 0x1 -#define hipStreamWaitValueAnd 0x2 -#define hipStreamWaitValueNor 0x3 -/* - * @brief HIP Memory Advise values - * @enum - * @ingroup Enumerations - */ -typedef enum hipMemoryAdvise { - hipMemAdviseSetReadMostly = 1, ///< Data will mostly be read and only occassionally - ///< be written to - hipMemAdviseUnsetReadMostly = 2, ///< Undo the effect of hipMemAdviseSetReadMostly - hipMemAdviseSetPreferredLocation = 3, ///< Set the preferred location for the data as - ///< the specified device - hipMemAdviseUnsetPreferredLocation = 4, ///< Clear the preferred location for the data - hipMemAdviseSetAccessedBy = 5, ///< Data will be accessed by the specified device, - ///< so prevent page faults as much as possible - hipMemAdviseUnsetAccessedBy = 6, ///< Let HIP to decide on the page faulting policy - ///< for the specified device - hipMemAdviseSetCoarseGrain = 100, ///< The default memory model is fine-grain. That allows - ///< coherent operations between host and device, while - ///< executing kernels. The coarse-grain can be used - ///< for data that only needs to be coherent at dispatch - ///< boundaries for better performance. - hipMemAdviseUnsetCoarseGrain = 101 ///< Restores cache coherency policy back to fine-grain -} hipMemoryAdvise; -/* - * @brief HIP range attributes - * @enum - * @ingroup Enumerations - */ -typedef enum hipMemRangeAttribute { - hipMemRangeAttributeReadMostly = 1, ///< Whether the range will mostly be read and - ///< only occassionally be written to - hipMemRangeAttributePreferredLocation = 2, ///< The preferred location of the range - hipMemRangeAttributeAccessedBy = 3, ///< Memory range has hipMemAdviseSetAccessedBy - ///< set for the specified device - hipMemRangeAttributeLastPrefetchLocation = 4,///< The last location to where the range was prefetched -} hipMemRangeAttribute; -/* - * @brief hipJitOption - * @enum - * @ingroup Enumerations - */ -typedef enum hipJitOption { - hipJitOptionMaxRegisters = 0, - hipJitOptionThreadsPerBlock, - hipJitOptionWallTime, - hipJitOptionInfoLogBuffer, - hipJitOptionInfoLogBufferSizeBytes, - hipJitOptionErrorLogBuffer, - hipJitOptionErrorLogBufferSizeBytes, - hipJitOptionOptimizationLevel, - hipJitOptionTargetFromContext, - hipJitOptionTarget, - hipJitOptionFallbackStrategy, - hipJitOptionGenerateDebugInfo, - hipJitOptionLogVerbose, - hipJitOptionGenerateLineInfo, - hipJitOptionCacheMode, - hipJitOptionSm3xOpt, - hipJitOptionFastCompile, - hipJitOptionNumOptions -} hipJitOption; -/** - * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored. 
- */ -typedef enum hipFuncAttribute { - hipFuncAttributeMaxDynamicSharedMemorySize = 8, - hipFuncAttributePreferredSharedMemoryCarveout = 9, - hipFuncAttributeMax -} hipFuncAttribute; -/** - * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored. - */ -typedef enum hipFuncCache_t { - hipFuncCachePreferNone, ///< no preference for shared memory or L1 (default) - hipFuncCachePreferShared, ///< prefer larger shared memory and smaller L1 cache - hipFuncCachePreferL1, ///< prefer larger L1 cache and smaller shared memory - hipFuncCachePreferEqual, ///< prefer equal size L1 cache and shared memory -} hipFuncCache_t; -/** - * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored. - */ -typedef enum hipSharedMemConfig { - hipSharedMemBankSizeDefault, ///< The compiler selects a device-specific value for the banking. - hipSharedMemBankSizeFourByte, ///< Shared mem is banked at 4-bytes intervals and performs best - ///< when adjacent threads access data 4 bytes apart. - hipSharedMemBankSizeEightByte ///< Shared mem is banked at 8-byte intervals and performs best - ///< when adjacent threads access data 4 bytes apart. -} hipSharedMemConfig; -/** - * Struct for data in 3D - * - */ -typedef struct dim3 { - uint32_t x; ///< x - uint32_t y; ///< y - uint32_t z; ///< z -#ifdef __cplusplus - constexpr __host__ __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){}; -#endif -} dim3; -typedef struct hipLaunchParams_t { - void* func; ///< Device function symbol - dim3 gridDim; ///< Grid dimentions - dim3 blockDim; ///< Block dimentions - void **args; ///< Arguments - size_t sharedMem; ///< Shared memory - hipStream_t stream; ///< Stream identifier -} hipLaunchParams; -typedef enum hipExternalMemoryHandleType_enum { - hipExternalMemoryHandleTypeOpaqueFd = 1, - hipExternalMemoryHandleTypeOpaqueWin32 = 2, - hipExternalMemoryHandleTypeOpaqueWin32Kmt = 3, - hipExternalMemoryHandleTypeD3D12Heap = 4, - hipExternalMemoryHandleTypeD3D12Resource = 5, - hipExternalMemoryHandleTypeD3D11Resource = 6, - hipExternalMemoryHandleTypeD3D11ResourceKmt = 7, -} hipExternalMemoryHandleType; -typedef struct hipExternalMemoryHandleDesc_st { - hipExternalMemoryHandleType type; - union { - int fd; - struct { - void *handle; - const void *name; - } win32; - } handle; - unsigned long long size; - unsigned int flags; -} hipExternalMemoryHandleDesc; -typedef struct hipExternalMemoryBufferDesc_st { - unsigned long long offset; - unsigned long long size; - unsigned int flags; -} hipExternalMemoryBufferDesc; -typedef void* hipExternalMemory_t; -typedef enum hipExternalSemaphoreHandleType_enum { - hipExternalSemaphoreHandleTypeOpaqueFd = 1, - hipExternalSemaphoreHandleTypeOpaqueWin32 = 2, - hipExternalSemaphoreHandleTypeOpaqueWin32Kmt = 3, - hipExternalSemaphoreHandleTypeD3D12Fence = 4 -} hipExternalSemaphoreHandleType; -typedef struct hipExternalSemaphoreHandleDesc_st { - hipExternalSemaphoreHandleType type; - union { - int fd; - struct { - void* handle; - const void* name; - } win32; - } handle; - unsigned int flags; -} hipExternalSemaphoreHandleDesc; -typedef void* hipExternalSemaphore_t; -typedef struct hipExternalSemaphoreSignalParams_st { - struct { - struct { - unsigned long long value; - } fence; - struct { - unsigned long long key; - } keyedMutex; - unsigned int reserved[12]; - } params; - unsigned int flags; - unsigned int reserved[16]; -} hipExternalSemaphoreSignalParams; -/** - * External semaphore wait parameters, compatible with 
driver type - */ -typedef struct hipExternalSemaphoreWaitParams_st { - struct { - struct { - unsigned long long value; - } fence; - struct { - unsigned long long key; - unsigned int timeoutMs; - } keyedMutex; - unsigned int reserved[10]; - } params; - unsigned int flags; - unsigned int reserved[16]; -} hipExternalSemaphoreWaitParams; - -#if __HIP_HAS_GET_PCH -/** - * Internal use only. This API may change in the future - * Pre-Compiled header for online compilation - * - */ - void __hipGetPCH(const char** pch, unsigned int*size); -#endif - -/* - * @brief HIP Devices used by current OpenGL Context. - * @enum - * @ingroup Enumerations - */ -typedef enum hipGLDeviceList { - hipGLDeviceListAll = 1, ///< All hip devices used by current OpenGL context. - hipGLDeviceListCurrentFrame = 2, ///< Hip devices used by current OpenGL context in current - ///< frame - hipGLDeviceListNextFrame = 3 ///< Hip devices used by current OpenGL context in next - ///< frame. -} hipGLDeviceList; - -/* - * @brief HIP Access falgs for Interop resources. - * @enum - * @ingroup Enumerations - */ -typedef enum hipGraphicsRegisterFlags { - hipGraphicsRegisterFlagsNone = 0, - hipGraphicsRegisterFlagsReadOnly = 1, ///< HIP will not write to this registered resource - hipGraphicsRegisterFlagsWriteDiscard = - 2, ///< HIP will only write and will not read from this registered resource - hipGraphicsRegisterFlagsSurfaceLoadStore = 4, ///< HIP will bind this resource to a surface - hipGraphicsRegisterFlagsTextureGather = - 8 ///< HIP will perform texture gather operations on this registered resource -} hipGraphicsRegisterFlags; - -typedef struct _hipGraphicsResource hipGraphicsResource; - -typedef hipGraphicsResource* hipGraphicsResource_t; - -// Doxygen end group GlobalDefs -/** @} */ -//------------------------------------------------------------------------------------------------- -// The handle allows the async commands to use the stream even if the parent hipStream_t goes -// out-of-scope. -// typedef class ihipStream_t * hipStream_t; -/* - * Opaque structure allows the true event (pointed at by the handle) to remain "live" even if the - * surrounding hipEvent_t goes out-of-scope. This is handy for cases where the hipEvent_t goes - * out-of-scope but the true event is being written by some async queue or device */ -// typedef struct hipEvent_t { -// struct ihipEvent_t *_handle; -//} hipEvent_t; -/** - * @defgroup API HIP API - * @{ - * - * Defines the HIP API. See the individual sections for more information. - */ -/** - * @defgroup Driver Initialization and Version - * @{ - * This section describes the initializtion and version functions of HIP runtime API. - * - */ -/** - * @brief Explicitly initializes the HIP runtime. - * - * Most HIP APIs implicitly initialize the HIP runtime. - * This API provides control over the timing of the initialization. - */ -// TODO-ctx - more description on error codes. -hipError_t hipInit(unsigned int flags); -/** - * @brief Returns the approximate HIP driver version. - * - * @param [out] driverVersion - * - * @returns #hipSuccess, #hipErrorInavlidValue - * - * @warning The HIP feature set does not correspond to an exact CUDA SDK driver revision. - * This function always set *driverVersion to 4 as an approximation though HIP supports - * some features which were introduced in later CUDA SDK revisions. - * HIP apps code should not rely on the driver revision number here and should - * use arch feature flags to test device capabilities or conditional compilation. 
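The two version queries that follow are commonly paired; per the warning above, both values are approximations rather than exact CUDA SDK revisions, so they should not gate feature use. A small usage sketch, assuming a standard HIP toolchain:

```cpp
#include <hip/hip_runtime_api.h>
#include <cstdio>

int main() {
    int driverVersion = 0, runtimeVersion = 0;
    hipDriverGetVersion(&driverVersion);   // approximation only, per the warning above
    hipRuntimeGetVersion(&runtimeVersion); // HIP patch version on AMD, CUDA version on NVCC
    std::printf("driver %d, runtime %d\n", driverVersion, runtimeVersion);
    return 0;
}
```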
- *
- * @see hipRuntimeGetVersion
- */
-hipError_t hipDriverGetVersion(int* driverVersion);
-/**
- * @brief Returns the approximate HIP Runtime version.
- *
- * @param [out] runtimeVersion
- *
- * @returns #hipSuccess, #hipErrorInvalidValue
- *
- * @warning On the HIP/HCC path this function returns the HIP runtime patch version; however, on the
- * HIP/NVCC path this function returns the CUDA runtime version.
- *
- * @see hipDriverGetVersion
- */
-hipError_t hipRuntimeGetVersion(int* runtimeVersion);
-/**
- * @brief Returns a handle to a compute device
- * @param [out] device
- * @param [in] ordinal
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice
- */
-hipError_t hipDeviceGet(hipDevice_t* device, int ordinal);
-/**
- * @brief Returns the compute capability of the device
- * @param [out] major
- * @param [out] minor
- * @param [in] device
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice
- */
-hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device);
-/**
- * @brief Returns an identifier string for the device.
- * @param [out] name
- * @param [in] len
- * @param [in] device
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice
- */
-hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
-/**
- * @brief Returns a value for attr of link between two devices
- * @param [out] value
- * @param [in] attr
- * @param [in] srcDevice
- * @param [in] dstDevice
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice
- */
-hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
-                                    int srcDevice, int dstDevice);
-/**
- * @brief Returns a PCI Bus Id string for the device, overloaded to take int device ID.
- * @param [out] pciBusId
- * @param [in] len
- * @param [in] device
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice
- */
-hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device);
-/**
- * @brief Returns a handle to a compute device.
- * @param [out] device handle
- * @param [in] PCI Bus ID
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
- */
-hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId);
-/**
- * @brief Returns the total amount of memory on the device.
- * @param [out] bytes
- * @param [in] device
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice
- */
-hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device);
-// doxygen end initialization
-/**
- * @}
- */
-/**
- * @defgroup Device Device Management
- * @{
- * This section describes the device management functions of HIP runtime API.
- */
-/**
- * @brief Waits on all active streams on current device
- *
- * When this command is invoked, the host thread gets blocked until all commands in all streams
- * associated with the device have completed. HIP does not support multiple blocking modes (yet!).
- *
- * @returns #hipSuccess
- *
- * @see hipSetDevice, hipDeviceReset
- */
-hipError_t hipDeviceSynchronize(void);
-/**
- * @brief The state of current device is discarded and updated to a fresh state.
- *
- * Calling this function deletes all streams created, memory allocated, kernels running, events
- * created. Make sure that no other thread is using the device or streams, memory, kernels, events
- * associated with the current device.
- *
- * @returns #hipSuccess
- *
- * @see hipDeviceSynchronize
- */
-hipError_t hipDeviceReset(void);
-/**
- * @brief Set default device to be used for subsequent hip API calls from this thread.
- *
- * @param[in] deviceId Valid device in range 0...hipGetDeviceCount().
- *
- * Sets @p device as the default device for the calling host thread. Valid device ids are 0...
- * (hipGetDeviceCount()-1).
- *
- * Many HIP APIs implicitly use the "default device" :
- *
- * - Any device memory subsequently allocated from this host thread (using hipMalloc) will be
- * allocated on device.
- * - Any streams or events created from this host thread will be associated with device.
- * - Any kernels launched from this host thread (using hipLaunchKernel) will be executed on device
- * (unless a specific stream is specified, in which case the device associated with that stream will
- * be used).
- *
- * This function may be called from any host thread. Multiple host threads may use the same device.
- * This function does no synchronization with the previous or new device, and has very little
- * runtime overhead. Applications can use hipSetDevice to quickly switch the default device before
- * making a HIP runtime call which uses the default device.
- *
- * The default device is stored in thread-local-storage for each thread.
- * Thread-pool implementations may inherit the default device of the previous thread. A good
- * practice is to always call hipSetDevice at the start of a HIP coding sequence to establish a known
- * standard device.
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorDeviceAlreadyInUse
- *
- * @see hipGetDevice, hipGetDeviceCount
- */
-hipError_t hipSetDevice(int deviceId);
-/**
- * @brief Return the default device id for the calling host thread.
- *
- * @param [out] device *device is written with the default device
- *
- * HIP maintains a default device for each thread using thread-local-storage.
- * This device is used implicitly for HIP runtime APIs called by this thread.
- * hipGetDevice returns in @p *device the default device for the calling host thread.
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
- *
- * @see hipSetDevice, hipGetDeviceCount
- */
-hipError_t hipGetDevice(int* deviceId);
-/**
- * @brief Return number of compute-capable devices.
- *
- * @param [out] count Returns number of compute-capable devices.
- *
- * @returns #hipSuccess, #hipErrorNoDevice
- *
- *
- * Returns in @p *count the number of devices that have the ability to run compute commands. If there
- * are no such devices, then @ref hipGetDeviceCount will return #hipErrorNoDevice. If 1 or more
- * devices can be found, then hipGetDeviceCount returns #hipSuccess.
- */
-hipError_t hipGetDeviceCount(int* count);
-/**
- * @brief Query for a specific device attribute.
- *
- * @param [out] pi pointer to value to return
- * @param [in] attr attribute to query
- * @param [in] deviceId which device to query for information
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
- */
-hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceId);
-/**
- * @brief Returns device properties.
- *
- * @param [out] prop written with device properties
- * @param [in] deviceId which device to query for information
- *
- * @return #hipSuccess, #hipErrorInvalidDevice
- * @bug HCC always returns 0 for maxThreadsPerMultiProcessor
- * @bug HCC always returns 0 for regsPerBlock
- * @bug HCC always returns 0 for l2CacheSize
- *
- * Populates @p prop with information for the specified device.
- */
-hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int deviceId);
-/**
- * @brief Set L1/Shared cache partition.
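The enumeration and selection entry points above are typically combined at program start. A minimal sketch, not taken from this patch; error handling mostly elided:

```cpp
#include <hip/hip_runtime_api.h>
#include <cstdio>

int main() {
    int count = 0;
    if (hipGetDeviceCount(&count) != hipSuccess || count == 0) return 1;
    for (int id = 0; id < count; ++id) {
        hipDeviceProp_t prop;
        hipGetDeviceProperties(&prop, id);
        std::printf("device %d: %s, %zu bytes\n", id, prop.name, prop.totalGlobalMem);
    }
    hipSetDevice(0);          // make device 0 the thread's default device
    int current = -1;
    hipGetDevice(&current);   // reads the thread-local default back
    std::printf("current device: %d\n", current);
    return 0;
}
```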
- *
- * @param [in] cacheConfig
- *
- * @returns #hipSuccess, #hipErrorNotInitialized
- * Note: AMD devices and some Nvidia GPUs do not support reconfigurable cache. This hint is ignored
- * on those architectures.
- *
- */
-hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig);
-/**
- * @brief Get Cache configuration for current device
- *
- * @param [out] cacheConfig
- *
- * @returns #hipSuccess, #hipErrorNotInitialized
- * Note: AMD devices and some Nvidia GPUs do not support reconfigurable cache. This hint is ignored
- * on those architectures.
- *
- */
-hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* cacheConfig);
-/**
- * @brief Get Resource limits of current device
- *
- * @param [out] pValue
- * @param [in] limit
- *
- * @returns #hipSuccess, #hipErrorUnsupportedLimit, #hipErrorInvalidValue
- * Note: Currently, only hipLimitMallocHeapSize is available
- *
- */
-hipError_t hipDeviceGetLimit(size_t* pValue, enum hipLimit_t limit);
-/**
- * @brief Returns bank width of shared memory for current device
- *
- * @param [out] pConfig
- *
- * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
- *
- * Note: AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
- * ignored on those architectures.
- *
- */
-hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig);
-/**
- * @brief Gets the flags set for current device
- *
- * @param [out] flags
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
- */
-hipError_t hipGetDeviceFlags(unsigned int* flags);
-/**
- * @brief The bank width of shared memory on current device is set
- *
- * @param [in] config
- *
- * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
- *
- * Note: AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
- * ignored on those architectures.
- *
- */
-hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config);
-/**
- * @brief The current device behavior is changed according to the flags passed.
- *
- * @param [in] flags
- *
- * The schedule flags impact how HIP waits for the completion of a command running on a device.
- * hipDeviceScheduleSpin : HIP runtime will actively spin in the thread which submitted the
- * work until the command completes. This offers the lowest latency, but will consume a CPU core
- * and may increase power. hipDeviceScheduleYield : The HIP runtime will yield the CPU to the
- * system so that other tasks can use it. This may increase latency to detect the completion but
- * will consume less power and is friendlier to other tasks in the system.
- * hipDeviceScheduleBlockingSync : On ROCm platform, this is a synonym for hipDeviceScheduleYield.
- * hipDeviceScheduleAuto : Use a heuristic to select between Spin and Yield modes. If the
- * number of HIP contexts is greater than the number of logical processors in the system, use Spin
- * scheduling. Else use Yield scheduling.
- *
- *
- * hipDeviceMapHost : Allow mapping host memory. On ROCm, this is always allowed and
- * the flag is ignored. hipDeviceLmemResizeToMax : @warning ROCm silently ignores this flag.
- * - * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorSetOnActiveProcess - * - * - */ -hipError_t hipSetDeviceFlags(unsigned flags); -/** - * @brief Device which matches hipDeviceProp_t is returned - * - * @param [out] device ID - * @param [in] device properties pointer - * - * @returns #hipSuccess, #hipErrorInvalidValue - */ -hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop); -/** - * @brief Returns the link type and hop count between two devices - * - * @param [in] device1 Ordinal for device1 - * @param [in] device2 Ordinal for device2 - * @param [out] linktype Returns the link type (See hsa_amd_link_info_type_t) between the two devices - * @param [out] hopcount Returns the hop count between the two devices - * - * Queries and returns the HSA link type and the hop count between the two specified devices. - * - * @returns #hipSuccess, #hipInvalidDevice, #hipErrorRuntimeOther - */ -hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount); -// TODO: implement IPC apis -/** - * @brief Gets an interprocess memory handle for an existing device memory - * allocation - * - * Takes a pointer to the base of an existing device memory allocation created - * with hipMalloc and exports it for use in another process. This is a - * lightweight operation and may be called multiple times on an allocation - * without adverse effects. - * - * If a region of memory is freed with hipFree and a subsequent call - * to hipMalloc returns memory with the same device address, - * hipIpcGetMemHandle will return a unique handle for the - * new memory. - * - * @param handle - Pointer to user allocated hipIpcMemHandle to return - * the handle in. - * @param devPtr - Base pointer to previously allocated device memory - * - * @returns - * hipSuccess, - * hipErrorInvalidHandle, - * hipErrorOutOfMemory, - * hipErrorMapFailed, - * - */ -hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr); -/** - * @brief Opens an interprocess memory handle exported from another process - * and returns a device pointer usable in the local process. - * - * Maps memory exported from another process with hipIpcGetMemHandle into - * the current device address space. For contexts on different devices - * hipIpcOpenMemHandle can attempt to enable peer access between the - * devices as if the user called hipDeviceEnablePeerAccess. This behavior is - * controlled by the hipIpcMemLazyEnablePeerAccess flag. - * hipDeviceCanAccessPeer can determine if a mapping is possible. - * - * Contexts that may open hipIpcMemHandles are restricted in the following way. - * hipIpcMemHandles from each device in a given process may only be opened - * by one context per device per other process. - * - * Memory returned from hipIpcOpenMemHandle must be freed with - * hipIpcCloseMemHandle. - * - * Calling hipFree on an exported memory region before calling - * hipIpcCloseMemHandle in the importing context will result in undefined - * behavior. - * - * @param devPtr - Returned device pointer - * @param handle - hipIpcMemHandle to open - * @param flags - Flags for this operation. Must be specified as hipIpcMemLazyEnablePeerAccess - * - * @returns - * hipSuccess, - * hipErrorMapFailed, - * hipErrorInvalidHandle, - * hipErrorTooManyPeers - * - * @note No guarantees are made about the address returned in @p *devPtr. - * In particular, multiple processes may not receive the same address for the same @p handle. 
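The IPC flow described above spans two processes; the handle itself must be shipped between them by some out-of-band channel (pipe, socket, shared file), which this sketch deliberately leaves out. Both helper names are illustrative, not part of the API:

```cpp
#include <hip/hip_runtime_api.h>

// Exporting process: allocate device memory and publish a handle for it.
hipIpcMemHandle_t exportBuffer(void** devPtr, size_t bytes) {
    hipIpcMemHandle_t handle{};
    hipMalloc(devPtr, bytes);
    hipIpcGetMemHandle(&handle, *devPtr);  // lightweight; may be called repeatedly
    return handle;                         // send to the peer via pipe/socket/shm
}

// Importing process: map the peer's allocation into this address space.
void* importBuffer(const hipIpcMemHandle_t& handle) {
    void* devPtr = nullptr;
    hipIpcOpenMemHandle(&devPtr, handle, hipIpcMemLazyEnablePeerAccess);
    return devPtr;  // must be released with hipIpcCloseMemHandle, not hipFree
}
```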
- *
- */
-hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags);
-/**
- * @brief Close memory mapped with hipIpcOpenMemHandle
- *
- * Unmaps memory returned by hipIpcOpenMemHandle. The original allocation
- * in the exporting process as well as imported mappings in other processes
- * will be unaffected.
- *
- * Any resources used to enable peer access will be freed if this is the
- * last mapping using them.
- *
- * @param devPtr - Device pointer returned by hipIpcOpenMemHandle
- *
- * @returns
- * hipSuccess,
- * hipErrorMapFailed,
- * hipErrorInvalidHandle,
- *
- */
-hipError_t hipIpcCloseMemHandle(void* devPtr);
-hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event);
-hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle);
-// end doxygen Device
-/**
- * @}
- */
-/**
- *
- * @defgroup Execution Execution Control
- * @{
- * This section describes the execution control functions of HIP runtime API.
- *
- */
-/**
- * @brief Set attribute for a specific function
- *
- * @param [in] func
- * @param [in] attr
- * @param [in] value
- *
- * @returns #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
- *
- * Note: AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
- * ignored on those architectures.
- *
- */
-hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value);
-/**
- * @brief Set Cache configuration for a specific function
- *
- * @param [in] config
- *
- * @returns #hipSuccess, #hipErrorNotInitialized
- * Note: AMD devices and some Nvidia GPUs do not support reconfigurable cache. This hint is ignored
- * on those architectures.
- *
- */
-hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t config);
-/**
- * @brief Set shared memory configuration for a specific function
- *
- * @param [in] func
- * @param [in] config
- *
- * @returns #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
- *
- * Note: AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
- * ignored on those architectures.
- *
- */
-hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config);
-//doxygen end execution
-/**
- * @}
- */
-/**
- *-------------------------------------------------------------------------------------------------
- *-------------------------------------------------------------------------------------------------
- * @defgroup Error Error Handling
- * @{
- * This section describes the error handling functions of HIP runtime API.
- */
-/**
- * @brief Return last error returned by any HIP runtime API call and resets the stored error code to
- * #hipSuccess
- *
- * @returns return code from last HIP called from the active host thread
- *
- * Returns the last error that has been returned by any of the runtime calls in the same host
- * thread, and then resets the saved error to #hipSuccess.
- *
- * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
- */
-hipError_t hipGetLastError(void);
-/**
- * @brief Return last error returned by any HIP runtime API call.
- *
- * @return #hipSuccess
- *
- * Returns the last error that has been returned by any of the runtime calls in the same host
- * thread. Unlike hipGetLastError, this function does not reset the saved error code.
- *
- * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
- */
-hipError_t hipPeekAtLastError(void);
-/**
- * @brief Return name of the specified error code in text form.
- *
- * @param hip_error Error code to convert to name.
- * @return const char pointer to the NULL-terminated error name
- *
- * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
- */
-const char* hipGetErrorName(hipError_t hip_error);
-/**
- * @brief Return handy text string message to explain the error which occurred
- *
- * @param hipError Error code to convert to string.
- * @return const char pointer to the NULL-terminated error string
- *
- * @warning : on HCC, this function returns the name of the error (same as hipGetErrorName)
- *
- * @see hipGetErrorName, hipGetLastError, hipPeekAtLastError, hipError_t
- */
-const char* hipGetErrorString(hipError_t hipError);
-// end doxygen Error
-/**
- * @}
- */
-/**
- *-------------------------------------------------------------------------------------------------
- *-------------------------------------------------------------------------------------------------
- * @defgroup Stream Stream Management
- * @{
- * This section describes the stream management functions of HIP runtime API.
- * The following Stream APIs are not (yet) supported in HIP:
- * - hipStreamAttachMemAsync is a nop
- */
-/**
- * @brief Create an asynchronous stream.
- *
- * @param[in, out] stream Valid pointer to hipStream_t. This function writes the memory with the
- * newly created stream.
- * @return #hipSuccess, #hipErrorInvalidValue
- *
- * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to
- * reference the newly created stream in subsequent hipStream* commands. The stream is allocated on
- * the heap and will remain allocated even if the handle goes out-of-scope. To release the memory
- * used by the stream, application must call hipStreamDestroy.
- *
- * @return #hipSuccess, #hipErrorInvalidValue
- *
- * @see hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
- */
-hipError_t hipStreamCreate(hipStream_t* stream);
-/**
- * @brief Create an asynchronous stream.
- *
- * @param[in, out] stream Pointer to new stream
- * @param[in ] flags to control stream creation.
- * @return #hipSuccess, #hipErrorInvalidValue
- *
- * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to
- * reference the newly created stream in subsequent hipStream* commands. The stream is allocated on
- * the heap and will remain allocated even if the handle goes out-of-scope. To release the memory
- * used by the stream, application must call hipStreamDestroy. Flags controls behavior of the
- * stream. See #hipStreamDefault, #hipStreamNonBlocking.
- *
- *
- * @see hipStreamCreate, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
- */
-hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags);
-/**
- * @brief Create an asynchronous stream with the specified priority.
- *
- * @param[in, out] stream Pointer to new stream
- * @param[in ] flags to control stream creation.
- * @param[in ] priority of the stream. Lower numbers represent higher priorities.
- * @return #hipSuccess, #hipErrorInvalidValue
- *
- * Create a new asynchronous stream with the specified priority. @p stream returns an opaque handle
- * that can be used to reference the newly created stream in subsequent hipStream* commands.
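The error-handling entry points above are almost always wrapped in a checking macro. A minimal sketch; the HIP_CHECK name is a common convention, not something this header defines:

```cpp
#include <hip/hip_runtime_api.h>
#include <cstdio>
#include <cstdlib>

// Common checking idiom built on hipGetErrorString (not part of the header).
#define HIP_CHECK(expr)                                                   \
    do {                                                                  \
        hipError_t err_ = (expr);                                         \
        if (err_ != hipSuccess) {                                         \
            std::fprintf(stderr, "%s failed: %s (%s:%d)\n", #expr,        \
                         hipGetErrorString(err_), __FILE__, __LINE__);    \
            std::exit(EXIT_FAILURE);                                      \
        }                                                                 \
    } while (0)

int main() {
    hipStream_t stream = nullptr;
    HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
    HIP_CHECK(hipStreamDestroy(stream));
    // hipPeekAtLastError() would report, without clearing, any sticky error here.
    return 0;
}
```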
- * The stream is allocated on the heap and will remain allocated even if the handle goes out-of-scope.
- * To release the memory used by the stream, application must call hipStreamDestroy. Flags controls
- * behavior of the stream. See #hipStreamDefault, #hipStreamNonBlocking.
- *
- *
- * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
- */
-hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority);
-/**
- * @brief Returns numerical values that correspond to the least and greatest stream priority.
- *
- * @param[in, out] leastPriority pointer in which value corresponding to least priority is returned.
- * @param[in, out] greatestPriority pointer in which value corresponding to greatest priority is returned.
- *
- * Returns in *leastPriority and *greatestPriority the numerical values that correspond to the least
- * and greatest stream priority respectively. Stream priorities follow a convention where lower numbers
- * imply greater priorities. The range of meaningful stream priorities is given by
- * [*greatestPriority, *leastPriority]. If the user attempts to create a stream with a priority value
- * that is outside the meaningful range as specified by this API, the priority is automatically
- * clamped to within the valid range.
- */
-hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
-/**
- * @brief Destroys the specified stream.
- *
- * @param[in] stream Stream to destroy.
- * @return #hipSuccess #hipErrorInvalidHandle
- *
- * Destroys the specified stream.
- *
- * If commands are still executing on the specified stream, some may complete execution before the
- * queue is deleted.
- *
- * The queue may be destroyed while some commands are still inflight, or may wait for all commands
- * queued to the stream before destroying it.
- *
- * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamQuery, hipStreamWaitEvent,
- * hipStreamSynchronize
- */
-hipError_t hipStreamDestroy(hipStream_t stream);
-/**
- * @brief Return #hipSuccess if all of the operations in the specified @p stream have completed, or
- * #hipErrorNotReady if not.
- *
- * @param[in] stream stream to query
- *
- * @return #hipSuccess, #hipErrorNotReady, #hipErrorInvalidHandle
- *
- * This is thread-safe and returns a snapshot of the current state of the queue. However, if other
- * host threads are sending work to the stream, the status may change immediately after the function
- * is called. It is typically used for debug.
- *
- * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamWaitEvent, hipStreamSynchronize,
- * hipStreamDestroy
- */
-hipError_t hipStreamQuery(hipStream_t stream);
-/**
- * @brief Wait for all commands in stream to complete.
- *
- * @param[in] stream stream identifier.
- *
- * @return #hipSuccess, #hipErrorInvalidHandle
- *
- * This command is host-synchronous : the host will block until the specified stream is empty.
- *
- * This command follows standard null-stream semantics. Specifically, specifying the null stream
- * will cause the command to wait for other streams on the same device to complete all pending
- * operations.
- *
- * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active
- * or blocking.
- *
- * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamWaitEvent, hipStreamDestroy
- *
- */
-hipError_t hipStreamSynchronize(hipStream_t stream);
-/**
- * @brief Make the specified compute stream wait for an event
- *
- * @param[in] stream stream to make wait.
- * @param[in] event event to wait on
- * @param[in] flags control operation [must be 0]
- *
- * @return #hipSuccess, #hipErrorInvalidHandle
- *
- * This function inserts a wait operation into the specified stream.
- * All future work submitted to @p stream will wait until @p event reports completion before
- * beginning execution.
- *
- * This function only waits for commands in the current stream to complete. Notably, this function
- * does not implicitly wait for commands in the default stream to complete, even if the specified
- * stream is created with hipStreamNonBlocking = 0.
- *
- * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamDestroy
- */
-hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags);
-/**
- * @brief Return flags associated with this stream.
- *
- * @param[in] stream stream to be queried
- * @param[in,out] flags Pointer to an unsigned integer in which the stream's flags are returned
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle
- *
- * @returns #hipSuccess #hipErrorInvalidValue #hipErrorInvalidHandle
- *
- * Return flags associated with this stream in *@p flags.
- *
- * @see hipStreamCreateWithFlags
- */
-hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int* flags);
-/**
- * @brief Query the priority of a stream.
- *
- * @param[in] stream stream to be queried
- * @param[in,out] priority Pointer to an integer in which the stream's priority is returned
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle
- *
- * @returns #hipSuccess #hipErrorInvalidValue #hipErrorInvalidHandle
- *
- * Query the priority of a stream. The priority is returned in @p priority.
- *
- * @see hipStreamCreateWithFlags
- */
-hipError_t hipStreamGetPriority(hipStream_t stream, int* priority);
-/**
- * @brief Create an asynchronous stream with the specified CU mask.
- *
- * @param[in, out] stream Pointer to new stream
- * @param[in ] cuMaskSize Size of CU mask bit array passed in.
- * @param[in ] cuMask Bit-vector representing the CU mask. Each active bit represents using one CU.
- * The first 32 bits represent the first 32 CUs, and so on. If its size is greater than physical
- * CU number (i.e., multiProcessorCount member of hipDeviceProp_t), the extra elements are ignored.
- * It is the user's responsibility to make sure the input is meaningful.
- * @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue
- *
- * Create a new asynchronous stream with the specified CU mask. @p stream returns an opaque handle
- * that can be used to reference the newly created stream in subsequent hipStream* commands. The
- * stream is allocated on the heap and will remain allocated even if the handle goes out-of-scope.
- * To release the memory used by the stream, application must call hipStreamDestroy.
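A short sketch tying the priority-related stream APIs above together; it assumes any device and elides error checking:

```cpp
#include <hip/hip_runtime_api.h>
#include <cstdio>

int main() {
    int least = 0, greatest = 0;
    hipDeviceGetStreamPriorityRange(&least, &greatest);  // lower number = higher priority

    hipStream_t stream = nullptr;
    hipStreamCreateWithPriority(&stream, hipStreamNonBlocking, greatest);

    unsigned int flags = 0;
    int priority = 0;
    hipStreamGetFlags(stream, &flags);       // returns hipStreamNonBlocking here
    hipStreamGetPriority(stream, &priority); // clamped into [greatest, least]
    std::printf("flags=0x%x priority=%d (range %d..%d)\n", flags, priority, greatest, least);

    hipStreamSynchronize(stream);  // host blocks until the stream drains
    hipStreamDestroy(stream);
    return 0;
}
```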
- * - * - * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy - */ -hipError_t hipExtStreamCreateWithCUMask(hipStream_t* stream, uint32_t cuMaskSize, const uint32_t* cuMask); -/** - * @brief Get CU mask associated with an asynchronous stream - * - * @param[in] stream stream to be queried - * @param[in] cuMaskSize number of the block of memories (uint32_t *) allocated by user - * @param[out] cuMask Pointer to a pre-allocated block of memories (uint32_t *) in which - * the stream's CU mask is returned. The CU mask is returned in a chunck of 32 bits where - * each active bit represents one active CU - * @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue - * - * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy - */ -hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32_t* cuMask); -/** - * Stream CallBack struct - */ -typedef void (*hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData); -/** - * @brief Adds a callback to be called on the host after all currently enqueued - * items in the stream have completed. For each - * hipStreamAddCallback call, a callback will be executed exactly once. - * The callback will block later work in the stream until it is finished. - * @param[in] stream - Stream to add callback to - * @param[in] callback - The function to call once preceding stream operations are complete - * @param[in] userData - User specified data to be passed to the callback function - * @param[in] flags - Reserved for future use, must be 0 - * @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorNotSupported - * - * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamQuery, hipStreamSynchronize, - * hipStreamWaitEvent, hipStreamDestroy, hipStreamCreateWithPriority - * - */ -hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData, - unsigned int flags); -// end doxygen Stream -/** - * @} - */ -/** - *------------------------------------------------------------------------------------------------- - *------------------------------------------------------------------------------------------------- - * @defgroup Stream Memory Operations - * @{ - * This section describes Stream Memory Wait and Write functions of HIP runtime API. - */ -/** - * @brief Enqueues a wait command to the stream. - * - * @param [in] stream - Stream identifier - * @param [in] ptr - Pointer to memory object allocated using 'hipMallocSignalMemory' flag - * @param [in] value - Value to be used in compare operation - * @param [in] flags - Defines the compare operation, supported values are hipStreamWaitValueGte - * hipStreamWaitValueEq, hipStreamWaitValueAnd and hipStreamWaitValueNor - * @param [in] mask - Mask to be applied on value at memory before it is compared with value, - * default value is set to enable every bit - * - * @returns #hipSuccess, #hipErrorInvalidValue - * - * Enqueues a wait command to the stream, all operations enqueued on this stream after this, will - * not execute until the defined wait condition is true. - * - * hipStreamWaitValueGte: waits until *ptr&mask >= value - * hipStreamWaitValueEq : waits until *ptr&mask == value - * hipStreamWaitValueAnd: waits until ((*ptr&mask) & value) != 0 - * hipStreamWaitValueNor: waits until ~((*ptr&mask) | (value&mask)) != 0 - * - * @note when using 'hipStreamWaitValueNor', mask is applied on both 'value' and '*ptr'. 
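The callback mechanism documented above expects a host function matching the hipStreamCallback_t signature. A minimal sketch; onStreamDone is an illustrative name, not part of the API:

```cpp
#include <hip/hip_runtime_api.h>
#include <cstdio>

// Matches the hipStreamCallback_t signature documented above.
static void onStreamDone(hipStream_t /*stream*/, hipError_t status, void* userData) {
    std::printf("stream finished (%s), tag=%s\n",
                hipGetErrorString(status), static_cast<const char*>(userData));
}

int main() {
    hipStream_t stream = nullptr;
    hipStreamCreate(&stream);
    static const char tag[] = "batch-0";
    // Runs on the host once all prior work in `stream` completes; flags must be 0.
    hipStreamAddCallback(stream, onStreamDone, const_cast<char*>(tag), 0);
    hipStreamSynchronize(stream);
    hipStreamDestroy(stream);
    return 0;
}
```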
- * - * @note Support for hipStreamWaitValue32 can be queried using 'hipDeviceGetAttribute()' and - * 'hipDeviceAttributeCanUseStreamWaitValue' flag. - * - * @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue64, hipStreamWriteValue64, - * hipStreamWriteValue32, hipDeviceGetAttribute - */ -hipError_t hipStreamWaitValue32(hipStream_t stream, void* ptr, int32_t value, unsigned int flags, - uint32_t mask __dparm(0xFFFFFFFF)); -/** - * @brief Enqueues a wait command to the stream. - * - * @param [in] stream - Stream identifier - * @param [in] ptr - Pointer to memory object allocated using 'hipMallocSignalMemory' flag - * @param [in] value - Value to be used in compare operation - * @param [in] flags - Defines the compare operation, supported values are hipStreamWaitValueGte - * hipStreamWaitValueEq, hipStreamWaitValueAnd and hipStreamWaitValueNor. - * @param [in] mask - Mask to be applied on value at memory before it is compared with value - * default value is set to enable every bit - * - * @returns #hipSuccess, #hipErrorInvalidValue - * - * Enqueues a wait command to the stream, all operations enqueued on this stream after this, will - * not execute until the defined wait condition is true. - * - * hipStreamWaitValueGte: waits until *ptr&mask >= value - * hipStreamWaitValueEq : waits until *ptr&mask == value - * hipStreamWaitValueAnd: waits until ((*ptr&mask) & value) != 0 - * hipStreamWaitValueNor: waits until ~((*ptr&mask) | (value&mask)) != 0 - * - * @note when using 'hipStreamWaitValueNor', mask is applied on both 'value' and '*ptr'. - * - * @note Support for hipStreamWaitValue64 can be queried using 'hipDeviceGetAttribute()' and - * 'hipDeviceAttributeCanUseStreamWaitValue' flag. - * - * @see hipExtMallocWithFlags, hipFree, hipStreamWaitValue32, hipStreamWriteValue64, - * hipStreamWriteValue32, hipDeviceGetAttribute - */ -hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, int64_t value, unsigned int flags, - uint64_t mask __dparm(0xFFFFFFFFFFFFFFFF)); -/** - * @brief Enqueues a write command to the stream. - * - * @param [in] stream - Stream identifier - * @param [in] ptr - Pointer to a GPU accessible memory object - * @param [in] value - Value to be written - * @param [in] flags - reserved, ignored for now, will be used in future releases - * - * @returns #hipSuccess, #hipErrorInvalidValue - * - * Enqueues a write command to the stream, write operation is performed after all earlier commands - * on this stream have completed the execution. - * - * @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32, - * hipStreamWaitValue64 - */ -hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, int32_t value, unsigned int flags); -/** - * @brief Enqueues a write command to the stream. - * - * @param [in] stream - Stream identifier - * @param [in] ptr - Pointer to a GPU accessible memory object - * @param [in] value - Value to be written - * @param [in] flags - reserved, ignored for now, will be used in future releases - * - * @returns #hipSuccess, #hipErrorInvalidValue - * - * Enqueues a write command to the stream, write operation is performed after all earlier commands - * on this stream have completed the execution. 
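The wait/write-value pair above implements lightweight producer/consumer ordering between streams. A hedged sketch, assuming an 8-byte allocation is acceptable for a 32-bit signal and that the device supports stream wait values; error checking elided:

```cpp
#include <hip/hip_runtime_api.h>

int main() {
    // Wait operations require signal memory (hipMallocSignalMemory), per the docs above.
    void* signal = nullptr;
    hipExtMallocWithFlags(&signal, 8, hipMallocSignalMemory);

    hipStream_t producer = nullptr, consumer = nullptr;
    hipStreamCreate(&producer);
    hipStreamCreate(&consumer);

    // Consumer blocks until *signal & mask == 1; the mask defaults to all bits set.
    hipStreamWaitValue32(consumer, signal, 1, hipStreamWaitValueEq, 0xFFFFFFFF);
    // ... enqueue dependent work on `consumer` here ...

    // Producer releases the consumer after its earlier commands complete.
    hipStreamWriteValue32(producer, signal, 1, 0);

    hipStreamSynchronize(consumer);
    hipStreamDestroy(producer);
    hipStreamDestroy(consumer);
    hipFree(signal);
    return 0;
}
```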
- *
- * @see hipExtMallocWithFlags, hipFree, hipStreamWriteValue32, hipStreamWaitValue32,
- * hipStreamWaitValue64
- */
-hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, int64_t value, unsigned int flags);
-// end doxygen Stream Memory Operations
-/**
- * @}
- */
-/**
- *-------------------------------------------------------------------------------------------------
- *-------------------------------------------------------------------------------------------------
- * @defgroup Event Event Management
- * @{
- * This section describes the event management functions of HIP runtime API.
- */
-/**
- * @brief Create an event with the specified flags
- *
- * @param[in,out] event Returns the newly created event.
- * @param[in] flags Flags to control event behavior. Valid values are #hipEventDefault,
- * #hipEventBlockingSync, #hipEventDisableTiming, #hipEventInterprocess
- * #hipEventDefault : Default flag. The event will use active synchronization and will support
- * timing. Blocking synchronization provides lowest possible latency at the expense of dedicating a
- * CPU to poll on the event.
- * #hipEventBlockingSync : The event will use blocking synchronization : if hipEventSynchronize is
- * called on this event, the thread will block until the event completes. This can increase latency
- * for the synchronization but can result in lower power and more resources for other CPU threads.
- * #hipEventDisableTiming : Disable recording of timing information. Events created with this flag
- * would not record profiling data and provide best performance if used for synchronization.
- * @warning On AMD platform, hipEventInterprocess support is under development. Use of this flag
- * will return an error.
- *
- * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
- * #hipErrorLaunchFailure, #hipErrorOutOfMemory
- *
- * @see hipEventCreate, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime
- */
-hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags);
-/**
- * Create an event
- *
- * @param[in,out] event Returns the newly created event.
- *
- * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
- * #hipErrorLaunchFailure, #hipErrorOutOfMemory
- *
- * @see hipEventCreateWithFlags, hipEventRecord, hipEventQuery, hipEventSynchronize,
- * hipEventDestroy, hipEventElapsedTime
- */
-hipError_t hipEventCreate(hipEvent_t* event);
-/**
- * @brief Record an event in the specified stream.
- *
- * @param[in] event event to record.
- * @param[in] stream stream in which to record event.
- * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized,
- * #hipErrorInvalidHandle, #hipErrorLaunchFailure
- *
- * hipEventQuery() or hipEventSynchronize() must be used to determine when the event
- * transitions from "recording" (after hipEventRecord() is called) to "recorded"
- * (when timestamps are set, if requested).
- *
- * Events which are recorded in a non-NULL stream will transition
- * from "recording" to "recorded" state when they reach the head of
- * the specified stream, after all previous
- * commands in that stream have completed executing.
- *
- * If hipEventRecord() has been previously called on this event, then this call will overwrite any
- * existing state in event.
- *
- * If this function is called on an event that is currently being recorded, results are undefined
- * - either outstanding recording may save state into the event, and the order is not guaranteed.
- * - * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, - * hipEventDestroy, hipEventElapsedTime - * - */ -#ifdef __cplusplus -hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream = NULL); -#else -hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream); -#endif -/** - * @brief Destroy the specified event. - * - * @param[in] event Event to destroy. - * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue, - * #hipErrorLaunchFailure - * - * Releases memory associated with the event. If the event is recording but has not completed - * recording when hipEventDestroy() is called, the function will return immediately and the - * completion_future resources will be released later, when the hipDevice is synchronized. - * - * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventRecord, - * hipEventElapsedTime - * - * @returns #hipSuccess - */ -hipError_t hipEventDestroy(hipEvent_t event); -/** - * @brief Wait for an event to complete. - * - * This function will block until the event is ready, waiting for all previous work in the stream - * specified when event was recorded with hipEventRecord(). - * - * If hipEventRecord() has not been called on @p event, this function returns immediately. - * - * TODO-hip- This function needs to support hipEventBlockingSync parameter. - * - * @param[in] event Event on which to wait. - * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized, - * #hipErrorInvalidHandle, #hipErrorLaunchFailure - * - * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventDestroy, hipEventRecord, - * hipEventElapsedTime - */ -hipError_t hipEventSynchronize(hipEvent_t event); -/** - * @brief Return the elapsed time between two events. - * - * @param[out] ms : Return time between start and stop in ms. - * @param[in] start : Start event. - * @param[in] stop : Stop event. - * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotReady, #hipErrorInvalidHandle, - * #hipErrorNotInitialized, #hipErrorLaunchFailure - * - * Computes the elapsed time between two events. Time is computed in ms, with - * a resolution of approximately 1 us. - * - * Events which are recorded in a NULL stream will block until all commands - * on all other streams complete execution, and then record the timestamp. - * - * Events which are recorded in a non-NULL stream will record their timestamp - * when they reach the head of the specified stream, after all previous - * commands in that stream have completed executing. Thus the time that - * the event recorded may be significantly after the host calls hipEventRecord(). - * - * If hipEventRecord() has not been called on either event, then #hipErrorInvalidHandle is - * returned. If hipEventRecord() has been called on both events, but the timestamp has not yet been - * recorded on one or both events (that is, hipEventQuery() would return #hipErrorNotReady on at - * least one of the events), then #hipErrorNotReady is returned. - * - * Note, for HIP Events used in kernel dispatch using hipExtLaunchKernelGGL/hipExtLaunchKernel, - * events passed in hipExtLaunchKernelGGL/hipExtLaunchKernel are not explicitly recorded and should - * only be used to get elapsed time for that specific launch. 
In case events are used across - * multiple dispatches, for example, start and stop events from different hipExtLaunchKernelGGL/ - * hipExtLaunchKernel calls, they will be treated as invalid unrecorded events, HIP will throw - * error "hipErrorInvalidHandle" from hipEventElapsedTime. - * - * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventDestroy, hipEventRecord, - * hipEventSynchronize - */ -hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop); -/** - * @brief Query event status - * - * @param[in] event Event to query. - * @returns #hipSuccess, #hipErrorNotReady, #hipErrorInvalidHandle, #hipErrorInvalidValue, - * #hipErrorNotInitialized, #hipErrorLaunchFailure - * - * Query the status of the specified event. This function will return #hipErrorNotReady if all - * commands in the appropriate stream (specified to hipEventRecord()) have completed. If that work - * has not completed, or if hipEventRecord() was not called on the event, then #hipSuccess is - * returned. - * - * @see hipEventCreate, hipEventCreateWithFlags, hipEventRecord, hipEventDestroy, - * hipEventSynchronize, hipEventElapsedTime - */ -hipError_t hipEventQuery(hipEvent_t event); -// end doxygen Events -/** - * @} - */ -/** - *------------------------------------------------------------------------------------------------- - *------------------------------------------------------------------------------------------------- - * @defgroup Memory Memory Management - * @{ - * This section describes the memory management functions of HIP runtime API. - * The following CUDA APIs are not currently supported: - * - cudaMalloc3D - * - cudaMalloc3DArray - * - TODO - more 2D, 3D, array APIs here. - * - * - */ -/** - * @brief Return attributes for the specified pointer - * - * @param[out] attributes for the specified pointer - * @param[in] pointer to get attributes for - * - * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - * - * @see hipGetDeviceCount, hipGetDevice, hipSetDevice, hipChooseDevice - */ -hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr); - -/** - * @brief Imports an external semaphore. - * - * @param[out] extSem_out External semaphores to be waited on - * @param[in] semHandleDesc Semaphore import handle descriptor - * - * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - * - * @see - */ -hipError_t hipImportExternalSemaphore(hipExternalSemaphore_t* extSem_out, - const hipExternalSemaphoreHandleDesc* semHandleDesc); -/** - * @brief Signals a set of external semaphore objects. 
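Tying the event APIs above together, the canonical timing pattern records a start and a stop event around the work of interest on the same stream. A minimal sketch, not from this patch; error checking elided:

```cpp
#include <hip/hip_runtime_api.h>
#include <cstdio>

int main() {
    hipEvent_t start = nullptr, stop = nullptr;
    hipEventCreate(&start);
    hipEventCreateWithFlags(&stop, hipEventDefault);

    hipEventRecord(start, nullptr);          // record on the null stream
    // ... enqueue the work to be timed here ...
    hipEventRecord(stop, nullptr);
    hipEventSynchronize(stop);               // block until `stop` has a timestamp

    float ms = 0.0f;
    hipEventElapsedTime(&ms, start, stop);   // ~1 us resolution, per the docs above
    std::printf("elapsed: %.3f ms\n", ms);

    hipEventDestroy(start);
    hipEventDestroy(stop);
    return 0;
}
```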
- * - * @param[in] extSem_out External semaphores to be waited on - * @param[in] paramsArray Array of semaphore parameters - * @param[in] numExtSems Number of semaphores to wait on - * @param[in] stream Stream to enqueue the wait operations in - * - * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - * - * @see - */ -hipError_t hipSignalExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemArray, - const hipExternalSemaphoreSignalParams* paramsArray, - unsigned int numExtSems, hipStream_t stream); -/** - * @brief Waits on a set of external semaphore objects - * - * @param[in] extSem_out External semaphores to be waited on - * @param[in] paramsArray Array of semaphore parameters - * @param[in] numExtSems Number of semaphores to wait on - * @param[in] stream Stream to enqueue the wait operations in - * - * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - * - * @see - */ -hipError_t hipWaitExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemArray, - const hipExternalSemaphoreWaitParams* paramsArray, - unsigned int numExtSems, hipStream_t stream); -/** - * @brief Destroys an external semaphore object and releases any references to the underlying resource. Any outstanding signals or waits must have completed before the semaphore is destroyed. - * - * @param[in] extSem handle to an external memory object - * - * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue - * - * @see - */ -hipError_t hipDestroyExternalSemaphore(hipExternalSemaphore_t extSem); - -/** -* @brief Imports an external memory object. -* -* @param[out] extMem_out Returned handle to an external memory object -* @param[in] memHandleDesc Memory import handle descriptor -* -* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue -* -* @see -*/ -hipError_t hipImportExternalMemory(hipExternalMemory_t* extMem_out, const hipExternalMemoryHandleDesc* memHandleDesc); -/** -* @brief Maps a buffer onto an imported memory object. -* -* @param[out] devPtr Returned device pointer to buffer -* @param[in] extMem Handle to external memory object -* @param[in] bufferDesc Buffer descriptor -* -* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue -* -* @see -*/ -hipError_t hipExternalMemoryGetMappedBuffer(void **devPtr, hipExternalMemory_t extMem, const hipExternalMemoryBufferDesc *bufferDesc); -/** -* @brief Destroys an external memory object. -* -* @param[in] extMem External memory object to be destroyed -* -* @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue -* -* @see -*/ -hipError_t hipDestroyExternalMemory(hipExternalMemory_t extMem); -/** - * @brief Allocate memory on the default accelerator - * - * @param[out] ptr Pointer to the allocated memory - * @param[in] size Requested memory size - * - * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. - * - * @return #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr) - * - * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, - * hipHostFree, hipHostMalloc - */ -hipError_t hipMalloc(void** ptr, size_t size); -/** - * @brief Allocate memory on the default accelerator - * - * @param[out] ptr Pointer to the allocated memory - * @param[in] size Requested memory size - * @param[in] flags Type of memory allocation - * - * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. 
- * - * @return #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr) - * - * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, - * hipHostFree, hipHostMalloc - */ -hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flags); -/** - * @brief Allocate pinned host memory [Deprecated] - * - * @param[out] ptr Pointer to the allocated host pinned memory - * @param[in] size Requested memory size - * - * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. - * - * @return #hipSuccess, #hipErrorOutOfMemory - * - * @deprecated use hipHostMalloc() instead - */ -DEPRECATED("use hipHostMalloc instead") -hipError_t hipMallocHost(void** ptr, size_t size); -/** - * @brief Allocate pinned host memory [Deprecated] - * - * @param[out] ptr Pointer to the allocated host pinned memory - * @param[in] size Requested memory size - * - * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. - * - * @return #hipSuccess, #hipErrorOutOfMemory - * - * @deprecated use hipHostMalloc() instead - */ -DEPRECATED("use hipHostMalloc instead") -hipError_t hipMemAllocHost(void** ptr, size_t size); -/** - * @brief Allocate device accessible page locked host memory - * - * @param[out] ptr Pointer to the allocated host pinned memory - * @param[in] size Requested memory size - * @param[in] flags Type of host memory allocation - * - * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. - * - * @return #hipSuccess, #hipErrorOutOfMemory - * - * @see hipSetDeviceFlags, hipHostFree - */ -hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags); -/** - *------------------------------------------------------------------------------------------------- - *------------------------------------------------------------------------------------------------- - * @addtogroup Memory Managed Memory - * @{ - * @ingroup Memory - * This section describes the managed memory management functions of HIP runtime API. - * - */ -/** - * @brief Allocates memory that will be automatically managed by HIP. - * - * @param [out] dev_ptr - pointer to allocated device memory - * @param [in] size - requested allocation size in bytes - * @param [in] flags - must be either hipMemAttachGlobal or hipMemAttachHost - * (defaults to hipMemAttachGlobal) - * - * @returns #hipSuccess, #hipErrorMemoryAllocation, #hipErrorNotSupported, #hipErrorInvalidValue - */ -hipError_t hipMallocManaged(void** dev_ptr, - size_t size, - unsigned int flags __dparm(hipMemAttachGlobal)); -/** - * @brief Prefetches memory to the specified destination device using HIP. - * - * @param [in] dev_ptr pointer to be prefetched - * @param [in] count size in bytes for prefetching - * @param [in] device destination device to prefetch to - * @param [in] stream stream to enqueue prefetch operation - * - * @returns #hipSuccess, #hipErrorInvalidValue - */ -hipError_t hipMemPrefetchAsync(const void* dev_ptr, - size_t count, - int device, - hipStream_t stream __dparm(0)); -/** - * @brief Advise about the usage of a given memory range to HIP. 
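- *
- * A minimal sketch combining managed allocation with advice (assumes device 0 and an element
- * count `n`; hipMemAdviseSetReadMostly is one of the hipMemoryAdvise enumerators; error
- * checking omitted):
- * @code
- * float* p = nullptr;
- * hipMallocManaged((void**)&p, n * sizeof(float));
- * hipMemAdvise(p, n * sizeof(float), hipMemAdviseSetReadMostly, 0);
- * hipMemPrefetchAsync(p, n * sizeof(float), 0, 0);  // prefetch to device 0 on the null stream
- * @endcode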
- *
- * @param [in] dev_ptr pointer to memory to set the advice for
- * @param [in] count size in bytes of the memory range
- * @param [in] advice advice to be applied for the specified memory range
- * @param [in] device device to apply the advice for
- *
- * @returns #hipSuccess, #hipErrorInvalidValue
- */
-hipError_t hipMemAdvise(const void* dev_ptr,
-                        size_t count,
-                        hipMemoryAdvise advice,
-                        int device);
-/**
- * @brief Query an attribute of a given memory range in HIP.
- *
- * @param [in,out] data a pointer to a memory location where the result of each
- * attribute query will be written to
- * @param [in] data_size the size of data
- * @param [in] attribute the attribute to query
- * @param [in] dev_ptr start of the range to query
- * @param [in] count size of the range to query
- *
- * @returns #hipSuccess, #hipErrorInvalidValue
- */
-hipError_t hipMemRangeGetAttribute(void* data,
-                                   size_t data_size,
-                                   hipMemRangeAttribute attribute,
-                                   const void* dev_ptr,
-                                   size_t count);
-/**
- * @brief Query attributes of a given memory range in HIP.
- *
- * @param [in,out] data a two-dimensional array containing pointers to memory locations
- * where the result of each attribute query will be written to
- * @param [in] data_sizes an array containing the sizes of each result
- * @param [in] attributes an array of attributes to query (num_attributes and the number
- * of attributes in this array should match)
- * @param [in] num_attributes the number of attributes to query
- * @param [in] dev_ptr start of the range to query
- * @param [in] count size of the range to query
- *
- * @returns #hipSuccess, #hipErrorInvalidValue
- */
-hipError_t hipMemRangeGetAttributes(void** data,
-                                    size_t* data_sizes,
-                                    hipMemRangeAttribute* attributes,
-                                    size_t num_attributes,
-                                    const void* dev_ptr,
-                                    size_t count);
-/**
- * @brief Attach memory to a stream asynchronously in HIP.
- *
- * @param [in] stream - stream in which to enqueue the attach operation
- * @param [in] dev_ptr - pointer to memory (must be a pointer to managed memory or
- * to a valid host-accessible region of system-allocated memory)
- * @param [in] length - length of memory (defaults to zero)
- * @param [in] flags - must be one of hipMemAttachGlobal, hipMemAttachHost or
- * hipMemAttachSingle (defaults to hipMemAttachSingle)
- *
- * @returns #hipSuccess, #hipErrorInvalidValue
- */
-hipError_t hipStreamAttachMemAsync(hipStream_t stream,
-                                   hipDeviceptr_t* dev_ptr,
-                                   size_t length __dparm(0),
-                                   unsigned int flags __dparm(hipMemAttachSingle));
-// end doxygen Managed Memory
-/**
- * @}
- */
-/**
- * @brief Allocate device accessible page locked host memory [Deprecated]
- *
- * @param[out] ptr Pointer to the allocated host pinned memory
- * @param[in] size Requested memory size
- * @param[in] flags Type of host memory allocation
- *
- * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
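- *
- * A minimal sketch of the recommended replacement call (hipHostMallocDefault is the
- * plain pinned-allocation flag; error checking omitted):
- * @code
- * void* hostPtr = nullptr;
- * hipHostMalloc(&hostPtr, 4096, hipHostMallocDefault);
- * @endcode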
- *
- * @return #hipSuccess, #hipErrorOutOfMemory
- *
- * @deprecated use hipHostMalloc() instead
- */
-DEPRECATED("use hipHostMalloc instead")
-hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags);
-/**
- * @brief Get Device pointer from Host Pointer allocated through hipHostMalloc
- *
- * @param[out] devPtr Device Pointer mapped to passed host pointer
- * @param[in] hstPtr Host Pointer allocated through hipHostMalloc
- * @param[in] flags Flags to be passed for extension
- *
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorOutOfMemory
- *
- * @see hipSetDeviceFlags, hipHostMalloc
- */
-hipError_t hipHostGetDevicePointer(void** devPtr, void* hstPtr, unsigned int flags);
-/**
- * @brief Return flags associated with host pointer
- *
- * @param[out] flagsPtr Memory location to store flags
- * @param[in] hostPtr Host Pointer allocated through hipHostMalloc
- * @return #hipSuccess, #hipErrorInvalidValue
- *
- * @see hipHostMalloc
- */
-hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr);
-/**
- * @brief Register host memory so it can be accessed from the current device.
- *
- * @param[out] hostPtr Pointer to host memory to be registered.
- * @param[in] sizeBytes Size of the host memory
- * @param[in] flags See below.
- *
- * Flags:
- * - #hipHostRegisterDefault Memory is Mapped and Portable
- * - #hipHostRegisterPortable Memory is considered registered by all contexts. HIP only supports
- * one context so this is always assumed true.
- * - #hipHostRegisterMapped Map the allocation into the address space for the current device.
- * The device pointer can be obtained with #hipHostGetDevicePointer.
- *
- *
- * After registering the memory, use #hipHostGetDevicePointer to obtain the mapped device pointer.
- * On many systems, the mapped device pointer will have a different value than the mapped host
- * pointer. Applications must use the device pointer in device code, and the host pointer in host
- * code.
- *
- * On some systems, registered memory is pinned. On some systems, registered memory may not
- * actually be pinned but uses OS or hardware facilities to allow GPU access to the host memory.
- *
- * Developers are strongly encouraged to register memory blocks which are aligned to the host
- * cache-line size (typically 64 bytes, but this can be obtained from the CPUID instruction).
- *
- * If registering non-aligned pointers, the application must take care when registering pointers
- * from the same cache line on different devices. HIP's coarse-grained synchronization model does
- * not guarantee correct results if different devices write to different parts of the same cache
- * block - typically one of the writes will "win" and overwrite data from the other registered
- * memory region.
- *
- * @return #hipSuccess, #hipErrorOutOfMemory
- *
- * @see hipHostUnregister, hipHostGetFlags, hipHostGetDevicePointer
- */
-hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags);
-/**
- * @brief Un-register host pointer
- *
- * @param[in] hostPtr Host pointer previously registered with #hipHostRegister
- * @return Error code
- *
- * @see hipHostRegister
- */
-hipError_t hipHostUnregister(void* hostPtr);
-/**
- * Allocates at least width (in bytes) * height bytes of linear memory.
- * Padding may occur to ensure alignment requirements are met for the given row.
- * The change in width size due to padding will be returned in *pitch.
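- *
- * A minimal sketch of pitched allocation and row/column addressing (assumes a
- * width x height array of float; error checking omitted):
- * @code
- * float* devPtr = nullptr;
- * size_t pitch = 0;
- * hipMallocPitch((void**)&devPtr, &pitch, width * sizeof(float), height);
- * // Address of element (row, col) in the padded allocation:
- * float* elem = (float*)((char*)devPtr + row * pitch) + col;
- * @endcode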
- * Currently the alignment is set to 128 bytes
- *
- * @param[out] ptr Pointer to the allocated device memory
- * @param[out] pitch Pitch for allocation (in bytes)
- * @param[in] width Requested pitched allocation width (in bytes)
- * @param[in] height Requested pitched allocation height
- *
- * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
- *
- * @return Error code
- *
- * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
- * hipMalloc3DArray, hipHostMalloc
- */
-hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height);
-/**
- * Allocates at least width (in bytes) * height bytes of linear memory.
- * Padding may occur to ensure alignment requirements are met for the given row.
- * The change in width size due to padding will be returned in *pitch.
- * Currently the alignment is set to 128 bytes
- *
- * @param[out] dptr Pointer to the allocated device memory
- * @param[out] pitch Pitch for allocation (in bytes)
- * @param[in] widthInBytes Requested pitched allocation width (in bytes)
- * @param[in] height Requested pitched allocation height
- * @param[in] elementSizeBytes Element size in bytes
- *
- * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
- * The intended usage of pitch is as a separate parameter of the allocation, used to compute addresses within the 2D array.
- * Given the row and column of an array element of type T, the address is computed as:
- * T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
- *
- * @return Error code
- *
- * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
- * hipMalloc3DArray, hipHostMalloc
- */
-hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch, size_t widthInBytes, size_t height, unsigned int elementSizeBytes);
-/**
- * @brief Free memory allocated by the HIP memory allocation API.
- * This API performs an implicit hipDeviceSynchronize() call.
- * If pointer is NULL, the hip runtime is initialized and hipSuccess is returned.
- *
- * @param[in] ptr Pointer to memory to be freed
- * @return #hipSuccess
- * @return #hipErrorInvalidDevicePointer (if pointer is invalid, including host pointers allocated
- * with hipHostMalloc)
- *
- * @see hipMalloc, hipMallocPitch, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
- * hipMalloc3DArray, hipHostMalloc
- */
-hipError_t hipFree(void* ptr);
-/**
- * @brief Free memory allocated by the HIP host memory allocation API. [Deprecated]
- *
- * @param[in] ptr Pointer to memory to be freed
- * @return #hipSuccess,
- * #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with
- * hipMalloc)
- * @deprecated use hipHostFree() instead
- */
-DEPRECATED("use hipHostFree instead")
-hipError_t hipFreeHost(void* ptr);
-/**
- * @brief Free memory allocated by the HIP host memory allocation API.
- * This API performs an implicit hipDeviceSynchronize() call.
- * If pointer is NULL, the hip runtime is initialized and hipSuccess is returned.
- *
- * @param[in] ptr Pointer to memory to be freed
- * @return #hipSuccess,
- * #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with
- * hipMalloc)
- *
- * @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D,
- * hipMalloc3DArray, hipHostMalloc
- */
-hipError_t hipHostFree(void* ptr);
-/**
- * @brief Copy data from src to dst.
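- *
- * A minimal sketch of a round trip (assumes `dptr` was allocated with hipMalloc and `host`
- * points to at least `n` bytes; error checking omitted):
- * @code
- * hipMemcpy(dptr, host, n, hipMemcpyHostToDevice);
- * hipMemcpy(host, dptr, n, hipMemcpyDeviceToHost);  // and back
- * @endcode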
- *
- * It supports memory from host to device,
- * device to host, device to device and host to host.
- * The src and dst must not overlap.
- *
- * For hipMemcpy, the copy is always performed by the current device (set by hipSetDevice).
- * For multi-gpu or peer-to-peer configurations, it is recommended to set the current device to the
- * device where the src data is physically located. For optimal peer-to-peer copies, the copy device
- * must be able to access the src and dst pointers (by calling hipDeviceEnablePeerAccess with copy
- * agent as the current device and src/dest as the peerDevice argument). If this is not done, the
- * hipMemcpy will still work, but will perform the copy using a staging buffer on the host.
- * Calling hipMemcpy with dst and src pointers that do not match the hipMemcpyKind results in
- * undefined behavior.
- *
- * @param[out] dst Data being copied to
- * @param[in] src Data being copied from
- * @param[in] sizeBytes Data size in bytes
- * @param[in] kind Memory copy type
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree, #hipErrorUnknown
- *
- * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
- * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
- * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
- * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
- * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
- * hipMemHostAlloc, hipMemHostGetDevicePointer
- */
-hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
-// TODO: Add description
-hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,
-                               hipMemcpyKind kind, hipStream_t stream);
-/**
- * @brief Copy data from Host to Device
- *
- * @param[out] dst Data being copied to
- * @param[in] src Data being copied from
- * @param[in] sizeBytes Data size in bytes
- *
- * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
- * #hipErrorInvalidValue
- *
- * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
- * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
- * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
- * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
- * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
- * hipMemHostAlloc, hipMemHostGetDevicePointer
- */
-hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes);
-/**
- * @brief Copy data from Device to Host
- *
- * @param[out] dst Data being copied to
- * @param[in] src Data being copied from
- * @param[in] sizeBytes Data size in bytes
- *
- * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
- * #hipErrorInvalidValue
- *
- * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
- * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
- * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
- * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
- * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
- * hipMemHostAlloc, hipMemHostGetDevicePointer
- */
-hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes);
-/**
- * @brief Copy data from Device to Device
- *
- * @param[out] dst Data being copied to
- * @param[in] src Data being copied from
- * @param[in] sizeBytes Data size in bytes
- *
- * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
- * #hipErrorInvalidValue
- *
- * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
- * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
- * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
- * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
- * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
- * hipMemHostAlloc, hipMemHostGetDevicePointer
- */
-hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes);
-/**
- * @brief Copy data from Host to Device asynchronously
- *
- * @param[out] dst Data being copied to
- * @param[in] src Data being copied from
- * @param[in] sizeBytes Data size in bytes
- * @param[in] stream Stream identifier
- *
- * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
- * #hipErrorInvalidValue
- *
- * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
- * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
- * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
- * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
- * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
- * hipMemHostAlloc, hipMemHostGetDevicePointer
- */
-hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, hipStream_t stream);
-/**
- * @brief Copy data from Device to Host asynchronously
- *
- * @param[out] dst Data being copied to
- * @param[in] src Data being copied from
- * @param[in] sizeBytes Data size in bytes
- * @param[in] stream Stream identifier
- *
- * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
- * #hipErrorInvalidValue
- *
- * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
- * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
- * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
- * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
- * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
- * hipMemHostAlloc, hipMemHostGetDevicePointer
- */
-hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream);
-/**
- * @brief Copy data from Device to Device asynchronously
- *
- * @param[out] dst Data being copied to
- * @param[in] src Data being copied from
- * @param[in] sizeBytes Data size in bytes
- * @param[in] stream Stream identifier
- *
- * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
- * #hipErrorInvalidValue
- *
- * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
- * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
- * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
- * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
- * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
- * hipMemHostAlloc, hipMemHostGetDevicePointer
- */
-hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes,
-                              hipStream_t stream);
-hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
-                              hipModule_t hmod, const char* name);
-hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol);
-hipError_t hipGetSymbolSize(size_t* size, const void* symbol);
-hipError_t hipMemcpyToSymbol(const void* symbol, const void* src,
-                             size_t sizeBytes, size_t offset __dparm(0),
-                             hipMemcpyKind kind __dparm(hipMemcpyHostToDevice));
-hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src,
-                                  size_t sizeBytes, size_t offset,
-                                  hipMemcpyKind kind, hipStream_t stream __dparm(0));
-hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol,
-                               size_t sizeBytes, size_t offset __dparm(0),
-                               hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost));
-hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol,
-                                    size_t sizeBytes, size_t offset,
-                                    hipMemcpyKind kind,
-                                    hipStream_t stream __dparm(0));
-/**
- * @brief Copy data from src to dst asynchronously.
- *
- * @warning If the host memory is not pinned, the memory copy will be performed synchronously. For
- * best performance, use hipHostMalloc to allocate host memory that is transferred asynchronously.
- *
- * @warning On HCC, hipMemcpyAsync does not support overlapped H2D and D2H copies.
- * For hipMemcpyAsync, the copy is always performed by the device associated with the specified
- * stream.
- *
- * For multi-gpu or peer-to-peer configurations, it is recommended to use a stream which is
- * attached to the device where the src data is physically located. For optimal peer-to-peer copies,
- * the copy device must be able to access the src and dst pointers (by calling
- * hipDeviceEnablePeerAccess with copy agent as the current device and src/dest as the peerDevice
- * argument). If this is not done, the hipMemcpyAsync will still work, but will perform the copy
- * using a staging buffer on the host.
- *
- * @param[out] dst Data being copied to
- * @param[in] src Data being copied from
- * @param[in] sizeBytes Data size in bytes
- * @param[in] kind Memory copy type
- * @param[in] stream Stream in which the copy is enqueued
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree, #hipErrorUnknown
- *
- * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
- * hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyToSymbol,
- * hipMemcpyFromSymbol, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync,
- * hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync,
- * hipMemcpyFromSymbolAsync
- */
-hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
-                          hipStream_t stream __dparm(0));
-/**
- * @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
- * byte value value.
- *
- * @param[out] dst Data being filled
- * @param[in] value Constant value to be set
- * @param[in] sizeBytes Data size in bytes
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
- */
-hipError_t hipMemset(void* dst, int value, size_t sizeBytes);
-/**
- * @brief Fills the first count bytes of the memory area pointed to by dest with the constant
- * byte value value.
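- *
- * A minimal sketch (assumes `dptr` was allocated with hipMalloc; error checking omitted):
- * @code
- * hipMemsetD8(dptr, 0xAB, count);  // write the byte 0xAB to the first count bytes
- * @endcode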
- *
- * @param[out] dest Data ptr to be filled
- * @param[in] value Constant value to be set
- * @param[in] count Number of values to be set
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
- */
-hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count);
-/**
- * @brief Fills the first count bytes of the memory area pointed to by dest with the constant
- * byte value value.
- *
- * hipMemsetD8Async() is asynchronous with respect to the host, so the call may return before the
- * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
- * stream argument. If stream is non-zero, the operation may overlap with operations in other
- * streams.
- *
- * @param[out] dest Data ptr to be filled
- * @param[in] value Constant value to be set
- * @param[in] count Number of values to be set
- * @param[in] stream - Stream identifier
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
- */
-hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream __dparm(0));
-/**
- * @brief Fills the first count 16-bit values of the memory area pointed to by dest with the
- * constant short value value.
- *
- * @param[out] dest Data ptr to be filled
- * @param[in] value Constant value to be set
- * @param[in] count Number of values to be set
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
- */
-hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count);
-/**
- * @brief Fills the first count 16-bit values of the memory area pointed to by dest with the
- * constant short value value.
- *
- * hipMemsetD16Async() is asynchronous with respect to the host, so the call may return before the
- * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
- * stream argument. If stream is non-zero, the operation may overlap with operations in other
- * streams.
- *
- * @param[out] dest Data ptr to be filled
- * @param[in] value Constant value to be set
- * @param[in] count Number of values to be set
- * @param[in] stream - Stream identifier
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
- */
-hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream __dparm(0));
-/**
- * @brief Fills the memory area pointed to by dest with the constant integer
- * value for specified number of times.
- *
- * @param[out] dest Data being filled
- * @param[in] value Constant value to be set
- * @param[in] count Number of values to be set
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
- */
-hipError_t hipMemsetD32(hipDeviceptr_t dest, int value, size_t count);
-/**
- * @brief Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant
- * byte value value.
- *
- * hipMemsetAsync() is asynchronous with respect to the host, so the call may return before the
- * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
- * stream argument. If stream is non-zero, the operation may overlap with operations in other
- * streams.
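- *
- * A minimal sketch (assumes `dptr` and `stream` are valid; error checking omitted):
- * @code
- * hipMemsetAsync(dptr, 0, n, stream);  // zero n bytes on `stream`
- * hipStreamSynchronize(stream);        // wait for the memset to finish
- * @endcode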
- *
- * @param[out] dst Pointer to device memory
- * @param[in] value - Value to set for each byte of specified memory
- * @param[in] sizeBytes - Size in bytes to set
- * @param[in] stream - Stream identifier
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
- */
-hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream __dparm(0));
-/**
- * @brief Fills the memory area pointed to by dev with the constant integer
- * value for specified number of times.
- *
- * hipMemsetD32Async() is asynchronous with respect to the host, so the call may return before the
- * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
- * stream argument. If stream is non-zero, the operation may overlap with operations in other
- * streams.
- *
- * @param[out] dst Pointer to device memory
- * @param[in] value - Value to set for each 32-bit word of specified memory
- * @param[in] count - Number of values to be set
- * @param[in] stream - Stream identifier
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
- */
-hipError_t hipMemsetD32Async(hipDeviceptr_t dst, int value, size_t count,
-                             hipStream_t stream __dparm(0));
-/**
- * @brief Fills the memory area pointed to by dst with the constant value.
- *
- * @param[out] dst Pointer to device memory
- * @param[in] pitch - Pitch of the destination device memory, in bytes
- * @param[in] value - Constant value to be set
- * @param[in] width - Width of the region to set, in bytes
- * @param[in] height - Height of the region to set, in rows
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
- */
-hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height);
-/**
- * @brief Fills asynchronously the memory area pointed to by dst with the constant value.
- *
- * @param[in] dst Pointer to device memory
- * @param[in] pitch - Pitch of the destination device memory, in bytes
- * @param[in] value - Constant value to be set
- * @param[in] width - Width of the region to set, in bytes
- * @param[in] height - Height of the region to set, in rows
- * @param[in] stream - Stream identifier
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
- */
-hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height, hipStream_t stream __dparm(0));
-/**
- * @brief Fills synchronously the memory area pointed to by pitchedDevPtr with the constant value.
- *
- * @param[in] pitchedDevPtr - Pointer to pitched device memory
- * @param[in] value - Constant value to be set
- * @param[in] extent - Extent of the region to set
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
- */
-hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent);
-/**
- * @brief Fills asynchronously the memory area pointed to by pitchedDevPtr with the constant value.
- *
- * @param[in] pitchedDevPtr - Pointer to pitched device memory
- * @param[in] value - Constant value to be set
- * @param[in] extent - Extent of the region to set
- * @param[in] stream - Stream identifier
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
- */
-hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, hipStream_t stream __dparm(0));
-/**
- * @brief Query memory info.
- * Returns a snapshot of the free memory and the total allocatable memory on the device.
- *
- * Returns in *free a snapshot of the current free memory.
- * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
- * @warning On HCC, the free memory only accounts for memory allocated by this process and may be
- * optimistic.
- */
-hipError_t hipMemGetInfo(size_t* free, size_t* total);
-hipError_t hipMemPtrGetInfo(void* ptr, size_t* size);
-/**
- * @brief Allocate an array on the device.
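- *
- * A minimal sketch (assumes a width x height array of float and the C++
- * hipCreateChannelDesc<float>() helper from the HIP headers; error checking omitted):
- * @code
- * hipChannelFormatDesc desc = hipCreateChannelDesc<float>();
- * hipArray* array = nullptr;
- * hipMallocArray(&array, &desc, width, height, hipArrayDefault);
- * @endcode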
- * - * @param[out] array Pointer to allocated array in device memory - * @param[in] desc Requested channel format - * @param[in] width Requested array allocation width - * @param[in] height Requested array allocation height - * @param[in] flags Requested properties of allocated array - * @return #hipSuccess, #hipErrorOutOfMemory - * - * @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree - */ -hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, size_t width, - size_t height __dparm(0), unsigned int flags __dparm(hipArrayDefault)); -hipError_t hipArrayCreate(hipArray** pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray); -hipError_t hipArrayDestroy(hipArray* array); -hipError_t hipArray3DCreate(hipArray** array, const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray); -hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent); -/** - * @brief Frees an array on the device. - * - * @param[in] array Pointer to array to free - * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized - * - * @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipHostMalloc, hipHostFree - */ -hipError_t hipFreeArray(hipArray* array); -/** - * @brief Frees a mipmapped array on the device - * - * @param[in] mipmappedArray - Pointer to mipmapped array to free - * - * @return #hipSuccess, #hipErrorInvalidValue - */ -hipError_t hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray); -/** - * @brief Allocate an array on the device. - * - * @param[out] array Pointer to allocated array in device memory - * @param[in] desc Requested channel format - * @param[in] extent Requested array allocation width, height and depth - * @param[in] flags Requested properties of allocated array - * @return #hipSuccess, #hipErrorOutOfMemory - * - * @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree - */ -hipError_t hipMalloc3DArray(hipArray** array, const struct hipChannelFormatDesc* desc, - struct hipExtent extent, unsigned int flags); -/** - * @brief Allocate a mipmapped array on the device - * - * @param[out] mipmappedArray - Pointer to allocated mipmapped array in device memory - * @param[in] desc - Requested channel format - * @param[in] extent - Requested allocation size (width field in elements) - * @param[in] numLevels - Number of mipmap levels to allocate - * @param[in] flags - Flags for extensions - * - * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation - */ -hipError_t hipMallocMipmappedArray( - hipMipmappedArray_t *mipmappedArray, - const struct hipChannelFormatDesc* desc, - struct hipExtent extent, - unsigned int numLevels, - unsigned int flags __dparm(0)); -/** - * @brief Gets a mipmap level of a HIP mipmapped array - * - * @param[out] levelArray - Returned mipmap level HIP array - * @param[in] mipmappedArray - HIP mipmapped array - * @param[in] level - Mipmap level - * - * @return #hipSuccess, #hipErrorInvalidValue - */ -hipError_t hipGetMipmappedArrayLevel( - hipArray_t *levelArray, - hipMipmappedArray_const_t mipmappedArray, - unsigned int level); -/** - * @brief Copies data between host and device. 
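- *
- * A minimal sketch copying a tightly packed host matrix into a pitched device allocation
- * (assumes `devPtr`/`pitch` came from hipMallocPitch; error checking omitted):
- * @code
- * hipMemcpy2D(devPtr, pitch,
- *             host, width * sizeof(float),
- *             width * sizeof(float), height,
- *             hipMemcpyHostToDevice);
- * @endcode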
- * - * @param[in] dst Destination memory address - * @param[in] dpitch Pitch of destination memory - * @param[in] src Source memory address - * @param[in] spitch Pitch of source memory - * @param[in] width Width of matrix transfer (columns in bytes) - * @param[in] height Height of matrix transfer (rows) - * @param[in] kind Type of transfer - * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, - * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection - * - * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, - * hipMemcpyAsync - */ -hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - size_t height, hipMemcpyKind kind); -/** - * @brief Copies memory for 2D arrays. - * @param[in] pCopy Parameters for the memory copy - * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, - * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection - * - * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, - * hipMemcpyToSymbol, hipMemcpyAsync -*/ -hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy); -/** - * @brief Copies memory for 2D arrays. - * @param[in] pCopy Parameters for the memory copy - * @param[in] stream Stream to use - * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, - * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection - * - * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, - * hipMemcpyToSymbol, hipMemcpyAsync -*/ -hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0)); -/** - * @brief Copies data between host and device. - * - * @param[in] dst Destination memory address - * @param[in] dpitch Pitch of destination memory - * @param[in] src Source memory address - * @param[in] spitch Pitch of source memory - * @param[in] width Width of matrix transfer (columns in bytes) - * @param[in] height Height of matrix transfer (rows) - * @param[in] kind Type of transfer - * @param[in] stream Stream to use - * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, - * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection - * - * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, - * hipMemcpyAsync - */ -hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - size_t height, hipMemcpyKind kind, hipStream_t stream __dparm(0)); -/** - * @brief Copies data between host and device. - * - * @param[in] dst Destination memory address - * @param[in] wOffset Destination starting X offset - * @param[in] hOffset Destination starting Y offset - * @param[in] src Source memory address - * @param[in] spitch Pitch of source memory - * @param[in] width Width of matrix transfer (columns in bytes) - * @param[in] height Height of matrix transfer (rows) - * @param[in] kind Type of transfer - * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, - * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection - * - * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, - * hipMemcpyAsync - */ -hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, - size_t spitch, size_t width, size_t height, hipMemcpyKind kind); -/** - * @brief Copies data between host and device. 
- *
- * @param[in] dst Destination memory address
- * @param[in] wOffset Destination starting X offset
- * @param[in] hOffset Destination starting Y offset
- * @param[in] src Source memory address
- * @param[in] spitch Pitch of source memory
- * @param[in] width Width of matrix transfer (columns in bytes)
- * @param[in] height Height of matrix transfer (rows)
- * @param[in] kind Type of transfer
- * @param[in] stream Stream in which the copy is enqueued
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
-                                   size_t spitch, size_t width, size_t height, hipMemcpyKind kind,
-                                   hipStream_t stream __dparm(0));
-/**
- * @brief Copies data between host and device.
- *
- * @param[in] dst Destination memory address
- * @param[in] wOffset Destination starting X offset
- * @param[in] hOffset Destination starting Y offset
- * @param[in] src Source memory address
- * @param[in] count size in bytes to copy
- * @param[in] kind Type of transfer
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
-                            size_t count, hipMemcpyKind kind);
-/**
- * @brief Copies data between host and device.
- *
- * @param[in] dst Destination memory address
- * @param[in] srcArray Source memory address
- * @param[in] wOffset Source starting X offset
- * @param[in] hOffset Source starting Y offset
- * @param[in] count Size in bytes to copy
- * @param[in] kind Type of transfer
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
-                              size_t count, hipMemcpyKind kind);
-/**
- * @brief Copies data between host and device.
- *
- * @param[in] dst Destination memory address
- * @param[in] dpitch Pitch of destination memory
- * @param[in] src Source memory address
- * @param[in] wOffset Source starting X offset
- * @param[in] hOffset Source starting Y offset
- * @param[in] width Width of matrix transfer (columns in bytes)
- * @param[in] height Height of matrix transfer (rows)
- * @param[in] kind Type of transfer
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind);
-/**
- * @brief Copies data between host and device asynchronously.
- *
- * @param[in] dst Destination memory address
- * @param[in] dpitch Pitch of destination memory
- * @param[in] src Source memory address
- * @param[in] wOffset Source starting X offset
- * @param[in] hOffset Source starting Y offset
- * @param[in] width Width of matrix transfer (columns in bytes)
- * @param[in] height Height of matrix transfer (rows)
- * @param[in] kind Type of transfer
- * @param[in] stream Stream in which the copy is enqueued
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream __dparm(0));
-/**
- * @brief Copies data between host and device.
- *
- * @param[in] dst Destination memory address
- * @param[in] srcArray Source array
- * @param[in] srcOffset Offset in bytes of source array
- * @param[in] count Size of memory copy in bytes
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count);
-/**
- * @brief Copies data between host and device.
- *
- * @param[in] dstArray Destination memory address
- * @param[in] dstOffset Offset in bytes of destination array
- * @param[in] srcHost Source host pointer
- * @param[in] count Size of memory copy in bytes
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count);
-/**
- * @brief Copies data between host and device.
- *
- * @param[in] p 3D memory copy parameters
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-hipError_t hipMemcpy3D(const struct hipMemcpy3DParms* p);
-/**
- * @brief Copies data between host and device asynchronously.
- *
- * @param[in] p 3D memory copy parameters
- * @param[in] stream Stream to use
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms* p, hipStream_t stream __dparm(0));
-/**
- * @brief Copies data between host and device.
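- *
- * A minimal sketch of a host-to-device 3D copy through the driver-style descriptor (the
- * field names follow the CUDA_MEMCPY3D-style layout this struct mirrors; treat the exact
- * members as an assumption and check the HIP_MEMCPY3D definition; error checking omitted):
- * @code
- * HIP_MEMCPY3D copy = {};
- * copy.srcMemoryType = hipMemoryTypeHost;
- * copy.srcHost = hostBuf;
- * copy.srcPitch = width * sizeof(float);
- * copy.srcHeight = height;
- * copy.dstMemoryType = hipMemoryTypeDevice;
- * copy.dstDevice = devPtr;
- * copy.dstPitch = pitch;
- * copy.dstHeight = height;
- * copy.WidthInBytes = width * sizeof(float);
- * copy.Height = height;
- * copy.Depth = depth;
- * hipDrvMemcpy3D(&copy);
- * @endcode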
- *
- * @param[in] pCopy 3D memory copy parameters
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy);
-/**
- * @brief Copies data between host and device asynchronously.
- *
- * @param[in] pCopy 3D memory copy parameters
- * @param[in] stream Stream to use
- * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
- * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
- *
- * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
- * hipMemcpyAsync
- */
-hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream);
-// doxygen end Memory
-/**
- * @}
- */
-/**
- *-------------------------------------------------------------------------------------------------
- *-------------------------------------------------------------------------------------------------
- * @defgroup PeerToPeer PeerToPeer Device Memory Access
- * @{
- * @warning PeerToPeer support is experimental.
- * This section describes the PeerToPeer device memory access functions of HIP runtime API.
- */
-/**
- * @brief Determine if a device can access a peer's memory.
- *
- * @param [out] canAccessPeer Returns the peer access capability (0 or 1)
- * @param [in] deviceId - device from where memory may be accessed.
- * @param [in] peerDeviceId - device where memory is physically located
- *
- * Returns "1" in @p canAccessPeer if the specified @p deviceId is capable
- * of directly accessing memory physically located on peerDeviceId, or "0" if not.
- *
- * Returns "0" in @p canAccessPeer if deviceId == peerDeviceId, and both are valid devices: a
- * device is not a peer of itself.
- *
- * @returns #hipSuccess,
- * @returns #hipErrorInvalidDevice if deviceId or peerDeviceId are not valid devices
- */
-hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId);
-/**
- * @brief Enable direct access from current device's virtual address space to memory allocations
- * physically located on a peer device.
- *
- * Memory which is already allocated on the peer device will be mapped into the address space of
- * the current device. In addition, all future memory allocations on peerDeviceId will be mapped
- * into the address space of the current device when the memory is allocated. The peer memory
- * remains accessible from the current device until a call to hipDeviceDisablePeerAccess or
- * hipDeviceReset.
- *
- *
- * @param [in] peerDeviceId
- * @param [in] flags
- *
- * Returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
- * @returns #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device.
- */
-hipError_t hipDeviceEnablePeerAccess(int peerDeviceId, unsigned int flags);
-/**
- * @brief Disable direct access from current device's virtual address space to memory allocations
- * physically located on a peer device.
- *
- * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been
- * enabled from the current device.
- *
- * @param [in] peerDeviceId
- *
- * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
- */
-hipError_t hipDeviceDisablePeerAccess(int peerDeviceId);
-/**
- * @brief Get information on memory allocations.
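- *
- * A minimal sketch recovering the base allocation from an interior pointer (assumes `dptr`
- * points somewhere inside an allocation made with hipMalloc; error checking omitted):
- * @code
- * hipDeviceptr_t base;
- * size_t size;
- * hipMemGetAddressRange(&base, &size, dptr);  // base/size describe the whole allocation
- * @endcode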
- *
- * @param [out] pbase - Base pointer address
- * @param [out] psize - Size of allocation
- * @param [in] dptr - Device Pointer
- *
- * @returns #hipSuccess, #hipErrorInvalidDevicePointer
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr);
-#ifndef USE_PEER_NON_UNIFIED
-#define USE_PEER_NON_UNIFIED 1
-#endif
-#if USE_PEER_NON_UNIFIED == 1
-/**
- * @brief Copies memory from one device to memory on another device.
- *
- * @param [out] dst - Destination device pointer.
- * @param [in] dstDeviceId - Destination device
- * @param [in] src - Source device pointer
- * @param [in] srcDeviceId - Source device
- * @param [in] sizeBytes - Size of memory copy in bytes
- *
- * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
- */
-hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId,
-                         size_t sizeBytes);
-/**
- * @brief Copies memory from one device to memory on another device.
- *
- * @param [out] dst - Destination device pointer.
- * @param [in] dstDeviceId - Destination device
- * @param [in] src - Source device pointer
- * @param [in] srcDevice - Source device
- * @param [in] sizeBytes - Size of memory copy in bytes
- * @param [in] stream - Stream identifier
- *
- * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
- */
-hipError_t hipMemcpyPeerAsync(void* dst, int dstDeviceId, const void* src, int srcDevice,
-                              size_t sizeBytes, hipStream_t stream __dparm(0));
-#endif
-// doxygen end PeerToPeer
-/**
- * @}
- */
-/**
- *-------------------------------------------------------------------------------------------------
- *-------------------------------------------------------------------------------------------------
- * @defgroup Context Context Management
- * @{
- * This section describes the context management functions of HIP runtime API.
- */
-/**
- *
- * @addtogroup ContextD Context Management [Deprecated]
- * @{
- * @ingroup Context
- * This section describes the deprecated context management functions of HIP runtime API.
- */
-/**
- * @brief Create a context and set it as current/default context
- *
- * @param [out] ctx
- * @param [in] flags
- * @param [in] device Associated device handle
- *
- * @return #hipSuccess
- *
- * @see hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxPushCurrent,
- * hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device);
-/**
- * @brief Destroy a HIP context.
- *
- * @param [in] ctx Context to destroy
- *
- * @returns #hipSuccess, #hipErrorInvalidValue
- *
- * @see hipCtxCreate, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent,
- * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxDestroy(hipCtx_t ctx);
-/**
- * @brief Pop the current/default context and return the popped context.
- *
- * @param [out] ctx
- *
- * @returns #hipSuccess, #hipErrorInvalidContext
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxSetCurrent, hipCtxGetCurrent,
- * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxPopCurrent(hipCtx_t* ctx);
-/**
- * @brief Push the context to be set as current/default context
- *
- * @param [in] ctx
- *
- * @returns #hipSuccess, #hipErrorInvalidContext
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxPushCurrent(hipCtx_t ctx);
-/**
- * @brief Set the passed context as current/default
- *
- * @param [in] ctx
- *
- * @returns #hipSuccess, #hipErrorInvalidContext
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxSetCurrent(hipCtx_t ctx);
-/**
- * @brief Get the handle of the current/default context
- *
- * @param [out] ctx
- *
- * @returns #hipSuccess, #hipErrorInvalidContext
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetDevice, hipCtxGetFlags, hipCtxPopCurrent,
- * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxGetCurrent(hipCtx_t* ctx);
-/**
- * @brief Get the handle of the device associated with current/default context
- *
- * @param [out] device
- *
- * @returns #hipSuccess, #hipErrorInvalidContext
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxGetDevice(hipDevice_t* device);
-/**
- * @brief Returns the approximate HIP api version.
- *
- * @param [in] ctx Context to check
- * @param [out] apiVersion
- *
- * @return #hipSuccess
- *
- * @warning The HIP feature set does not correspond to an exact CUDA SDK api revision.
- * This function always sets *apiVersion to 4 as an approximation though HIP supports
- * some features which were introduced in later CUDA SDK revisions.
- * HIP application code should not rely on the api revision number here and should
- * use arch feature flags to test device capabilities or conditional compilation.
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetDevice, hipCtxGetFlags, hipCtxPopCurrent,
- * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion);
-/**
- * @brief Get Cache configuration for a specific function
- *
- * @param [out] cacheConfig
- *
- * @return #hipSuccess
- *
- * @warning AMD devices and some Nvidia GPUs do not support reconfigurable cache. This hint is
- * ignored on those architectures.
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxGetCacheConfig(hipFuncCache_t* cacheConfig);
-/**
- * @brief Set L1/Shared cache partition.
- *
- * @param [in] cacheConfig
- *
- * @return #hipSuccess
- *
- * @warning AMD devices and some Nvidia GPUs do not support reconfigurable cache. This hint is
- * ignored on those architectures.
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxSetCacheConfig(hipFuncCache_t cacheConfig);
-/**
- * @brief Set Shared memory bank configuration.
- *
- * @param [in] config
- *
- * @return #hipSuccess
- *
- * @warning AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
- * ignored on those architectures.
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config);
-/**
- * @brief Get Shared memory bank configuration.
- *
- * @param [out] pConfig
- *
- * @return #hipSuccess
- *
- * @warning AMD devices and some Nvidia GPUs do not support shared cache banking, and the hint is
- * ignored on those architectures.
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig);
-/**
- * @brief Blocks until the default context has completed all preceding requested tasks.
- *
- * @return #hipSuccess
- *
- * @warning This function waits for all streams on the default context to complete execution, and
- * then returns.
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxSynchronize(void);
-/**
- * @brief Return flags used for creating default context.
- *
- * @param [out] flags
- *
- * @returns #hipSuccess
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxGetFlags(unsigned int* flags);
-/**
- * @brief Enables direct access to memory allocations in a peer context.
- *
- * Memory which is already allocated on the peer device will be mapped into the address space of
- * the current device. In addition, all future memory allocations on peerDeviceId will be mapped
- * into the address space of the current device when the memory is allocated. The peer memory
- * remains accessible from the current device until a call to hipDeviceDisablePeerAccess or
- * hipDeviceReset.
- *
- *
- * @param [in] peerCtx
- * @param [in] flags
- *
- * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
- * #hipErrorPeerAccessAlreadyEnabled
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- * @warning PeerToPeer support is experimental.
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags);
-/**
- * @brief Disable direct access from the current context's virtual address space to memory
- * allocations physically located on a peer context, and unregister any registered allocations.
- *
- * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been
- * enabled from the current device.
- *
- * @param [in] peerCtx
- *
- * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- * @warning PeerToPeer support is experimental.
- */
-DEPRECATED(DEPRECATED_MSG)
-hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx);
-// doxygen end Context deprecated
-/**
- * @}
- */
-/**
- * @brief Get the state of the primary context.
- *
- * @param [in] dev Device to get primary context flags for
- * @param [out] flags Pointer to store flags
- * @param [out] active Pointer to store context state; 0 = inactive, 1 = active
- *
- * @returns #hipSuccess
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active);
-/**
- * @brief Release the primary context on the GPU.
- *
- * @param [in] dev Device whose primary context is released
- *
- * @returns #hipSuccess
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- * @warning This function returns #hipSuccess even though it does not release the primary context,
- * by design, on the HIP/HCC path.
- */
-hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev);
-/**
- * @brief Retain the primary context on the GPU.
- *
- * @param [out] pctx Returned context handle of the new context
- * @param [in] dev Device whose primary context is retained
- *
- * @returns #hipSuccess
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev);
-/**
- * @brief Resets the primary context on the GPU.
- *
- * @param [in] dev Device whose primary context is reset
- *
- * @returns #hipSuccess
- *
- * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
- * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
- */
-hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev);
-/**
- * @brief Set flags for the primary context.
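- *
- * A minimal sketch (hipDeviceScheduleBlockingSync is one of the scheduling flags; the flag
- * choice is illustrative; error checking omitted):
- * @code
- * hipDevicePrimaryCtxSetFlags(dev, hipDeviceScheduleBlockingSync);
- * @endcode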
- * - * @param [in] dev Device for which the primary context flags are set - * @param [in] flags New flags for the device - * - * @returns #hipSuccess, #hipErrorContextAlreadyInUse - * - * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, - * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice - */ -hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags); -// doxygen end Context Management -/** - * @} - */ -/** - * - * @defgroup Module Module Management - * @{ - * This section describes the module management functions of HIP runtime API. - * - */ -/** - * @brief Loads a code object from file into a hipModule_t - * - * @param [in] fname - * @param [out] module - * - * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorFileNotFound, - * hipErrorOutOfMemory, hipErrorSharedObjectInitFailed, hipErrorNotInitialized - * - */ -hipError_t hipModuleLoad(hipModule_t* module, const char* fname); -/** - * @brief Frees the module - * - * @param [in] module - * - * @returns hipSuccess, hipErrorInvalidValue - * - * The module is freed and the code objects associated with it are destroyed. - * - */ -hipError_t hipModuleUnload(hipModule_t module); -/** - * @brief Extracts the function named kname from the module, if present - * - * @param [in] module - * @param [in] kname - * @param [out] function - * - * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorNotInitialized, - * hipErrorNotFound - */ -hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, const char* kname);
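As a point of reference, the typical load/extract/unload sequence looks roughly like the sketch below; the code object file name "kernels.co" and the kernel name "vec_add" are illustrative placeholders, not part of this API:

    hipModule_t module = nullptr;
    hipFunction_t kernel = nullptr;
    // Load a code object from disk, then look up one of its kernels by name.
    if (hipModuleLoad(&module, "kernels.co") == hipSuccess &&
        hipModuleGetFunction(&kernel, module, "vec_add") == hipSuccess) {
        // ... launch via hipModuleLaunchKernel (see the sketch further below) ...
    }
    hipModuleUnload(module);  // frees the module and its code objects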
-/** - * @brief Find out attributes for a given function. - * - * @param [out] attr - * @param [in] func - * - * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidDeviceFunction - */ -hipError_t hipFuncGetAttributes(struct hipFuncAttributes* attr, const void* func); -/** - * @brief Find out a specific attribute for a given function. - * - * @param [out] value - * @param [in] attrib - * @param [in] hfunc - * - * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidDeviceFunction - */ -hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc); -/** - * @brief Returns the handle of the texture reference with the given name from the module. - * - * @param [in] hmod - * @param [in] name - * @param [out] texRef - * - * @returns hipSuccess, hipErrorNotInitialized, hipErrorNotFound, hipErrorInvalidValue - */ -hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name); -/** - * @brief Builds a module from a code object residing in host memory. image is a pointer to that - * location. - * - * @param [in] image - * @param [out] module - * - * @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory - */ -hipError_t hipModuleLoadData(hipModule_t* module, const void* image); -/** - * @brief Builds a module from a code object residing in host memory. image is a pointer to that - * location. Options are not used. hipModuleLoadData is called. - * - * @param [in] image - * @param [out] module - * @param [in] numOptions Number of options - * @param [in] options Options for JIT - * @param [in] optionValues Option values for JIT - * - * @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory - */ -hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions, - hipJitOption* options, void** optionValues); -/** - * @brief Launches kernel f with launch parameters and shared memory on stream with arguments passed - * to kernelParams or extra - * - * @param [in] f Kernel to launch. - * @param [in] gridDimX X grid dimension specified as multiple of blockDimX. - * @param [in] gridDimY Y grid dimension specified as multiple of blockDimY. - * @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ. - * @param [in] blockDimX X block dimension specified in work-items - * @param [in] blockDimY Y block dimension specified in work-items - * @param [in] blockDimZ Z block dimension specified in work-items - * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The - * HIP-Clang compiler provides support for extern shared declarations. - * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the - * default stream is used with associated synchronization rules. - * @param [in] kernelParams - * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and - * must be in the memory layout and alignment expected by the kernel. - * - * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue - * - * @warning The kernelParams argument is not yet implemented in HIP. Please use extra instead. Please - * refer to hip_porting_driver_api.md for sample usage. - */ -hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, - unsigned int gridDimZ, unsigned int blockDimX, - unsigned int blockDimY, unsigned int blockDimZ, - unsigned int sharedMemBytes, hipStream_t stream, - void** kernelParams, void** extra);
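Since kernelParams is not yet implemented, arguments go through extra as a packed buffer. A minimal sketch, continuing the module/kernel handles from the sketch above; devPtr and n are placeholders, and the argument struct layout must match what the kernel expects:

    struct { void* buf; int n; } args{devPtr, n};  // packed exactly as the kernel reads it
    size_t argsSize = sizeof(args);
    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
                      HIP_LAUNCH_PARAM_BUFFER_SIZE, &argsSize,
                      HIP_LAUNCH_PARAM_END};
    // 256 work-items per block, enough blocks to cover n elements.
    hipModuleLaunchKernel(kernel, (n + 255) / 256, 1, 1, 256, 1, 1,
                          0 /*sharedMemBytes*/, 0 /*stream*/, nullptr, config);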
-/** - * @brief Launches kernel f with launch parameters and shared memory on stream with arguments passed - * to kernelParams, where thread blocks can cooperate and synchronize as they execute - * - * @param [in] f Kernel to launch. - * @param [in] gridDim Grid dimensions specified as multiple of blockDim. - * @param [in] blockDimX Block dimensions specified in work-items - * @param [in] kernelParams A list of kernel arguments - * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The - * HIP-Clang compiler provides support for extern shared declarations. - * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the - * default stream is used with associated synchronization rules. - * - * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge - */ -hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, - void** kernelParams, unsigned int sharedMemBytes, - hipStream_t stream); -/** - * @brief Launches kernels on multiple devices where thread blocks can cooperate and - * synchronize as they execute. - * - * @param [in] launchParamsList List of launch parameters, one per device. - * @param [in] numDevices Size of the launchParamsList array. - * @param [in] flags Flags to control launch behavior. - * - * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge - */ -hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, - int numDevices, unsigned int flags); -/** - * @brief Launches kernels on multiple devices and guarantees that all specified kernels are dispatched - * on their respective streams before any other work is enqueued on those streams from any other thread - * - * @param [in] launchParamsList List of launch parameters, one per device. - * @param [in] numDevices Size of the launchParamsList array. - * @param [in] flags Flags to control launch behavior. - * - * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue - */ -hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, - int numDevices, unsigned int flags); -// doxygen end Module -/** - * @} - */ -/** - * - * @defgroup Occupancy Occupancy - * @{ - * This section describes the occupancy functions of HIP runtime API. - * - */ -/** - * @brief Determines the grid and block sizes that achieve maximum occupancy for a kernel - * - * @param [out] gridSize minimum grid size for maximum potential occupancy - * @param [out] blockSize block size for maximum potential occupancy - * @param [in] f kernel function for which occupancy is calculated - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block - * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit - * - * @returns hipSuccess, hipErrorInvalidDevice, hipErrorInvalidValue - */ -//TODO - Match CUoccupancyB2DSize -hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - int blockSizeLimit); -/** - * @brief Determines the grid and block sizes that achieve maximum occupancy for a kernel - * - * @param [out] gridSize minimum grid size for maximum potential occupancy - * @param [out] blockSize block size for maximum potential occupancy - * @param [in] f kernel function for which occupancy is calculated - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block - * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit - * @param [in] flags Extra flags for occupancy calculation (only default supported) - * - * @returns hipSuccess, hipErrorInvalidDevice, hipErrorInvalidValue - */ -//TODO - Match CUoccupancyB2DSize -hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - int blockSizeLimit, unsigned int flags); -/** - * @brief Returns occupancy for a device function. - * - * @param [out] numBlocks Returned occupancy - * @param [in] f Kernel function (hipFunction_t) for which occupancy is calculated - * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block - */ -hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor( - int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk); -/** - * @brief Returns occupancy for a device function. - * - * @param [out] numBlocks Returned occupancy - * @param [in] f Kernel function (hipFunction_t) for which occupancy is calculated - * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block - * @param [in] flags Extra flags for occupancy calculation (only default supported) - */ -hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags); -/** - * @brief Returns occupancy for a device function. - * - * @param [out] numBlocks Returned occupancy - * @param [in] f Kernel function for which occupancy is calculated - * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block - */ -hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk); -/** - * @brief Returns occupancy for a device function. - * - * @param [out] numBlocks Returned occupancy - * @param [in] f Kernel function for which occupancy is calculated - * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block - * @param [in] flags Extra flags for occupancy calculation (currently ignored) - */ -hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags __dparm(hipOccupancyDefault)); -/** - * @brief Determines the grid and block sizes that achieve maximum occupancy for a kernel - * - * @param [out] gridSize minimum grid size for maximum potential occupancy - * @param [out] blockSize block size for maximum potential occupancy - * @param [in] f kernel function for which occupancy is calculated - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block - * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit - * - * @returns hipSuccess, hipErrorInvalidDevice, hipErrorInvalidValue - */ -hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, - const void* f, size_t dynSharedMemPerBlk, - int blockSizeLimit);
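The two output values are meant to feed straight into the launch configuration. A minimal sketch, assuming a __global__ kernel k that takes a single float* argument d:

    int minGridSize = 0, blockSize = 0;
    hipOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize,
                                      reinterpret_cast<const void*>(k), 0, 0);
    hipLaunchKernelGGL(k, dim3(minGridSize), dim3(blockSize), 0, 0, d);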
-// doxygen end Occupancy -/** - * @} - */ -/** - *------------------------------------------------------------------------------------------------- - *------------------------------------------------------------------------------------------------- - * @defgroup Profiler Profiler Control [Deprecated] - * @{ - * This section describes the profiler control functions of HIP runtime API. - * - * @warning The cudaProfilerInitialize API format for "configFile" is not supported. - * - */ -// TODO - expand descriptions: -/** - * @brief Start recording of profiling information - * When using this API, start the profiler with profiling disabled. (--startdisabled) - * @warning hipProfilerStart API is under development. - */ -DEPRECATED("use roctracer/rocTX instead") -hipError_t hipProfilerStart(); -/** - * @brief Stop recording of profiling information. - * When using this API, start the profiler with profiling disabled. (--startdisabled) - * @warning hipProfilerStop API is under development.
- */ -DEPRECATED("use roctracer/rocTX instead") -hipError_t hipProfilerStop(); -// doxygen end profiler -/** - * @} - */ -/** - *------------------------------------------------------------------------------------------------- - *------------------------------------------------------------------------------------------------- - * @defgroup Clang Launch API to support the triple-chevron syntax - * @{ - * This section describes the API to support the triple-chevron syntax. - */ -/** - * @brief Configure a kernel launch. - * - * @param [in] gridDim grid dimension specified as multiple of blockDim. - * @param [in] blockDim block dimensions specified in work-items - * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The - * HIP-Clang compiler provides support for extern shared declarations. - * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the - * default stream is used with associated synchronization rules. - * - * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue - * - */ -hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dparm(0), hipStream_t stream __dparm(0)); -/** - * @brief Set a kernel argument. - * - * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue - * - * @param [in] arg Pointer the argument in host memory. - * @param [in] size Size of the argument. - * @param [in] offset Offset of the argument on the argument stack. - * - */ -hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset); -/** - * @brief Launch a kernel. - * - * @param [in] func Kernel to launch. - * - * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue - * - */ -hipError_t hipLaunchByPtr(const void* func); -/** - * @brief Push configuration of a kernel launch. - * - * @param [in] gridDim grid dimension specified as multiple of blockDim. - * @param [in] blockDim block dimensions specified in work-items - * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The - * HIP-Clang compiler provides support for extern shared declarations. - * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the - * default stream is used with associated synchronization rules. - * - * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue - * - */ -hipError_t __hipPushCallConfiguration(dim3 gridDim, - dim3 blockDim, - size_t sharedMem __dparm(0), - hipStream_t stream __dparm(0)); -/** - * @brief Pop configuration of a kernel launch. - * - * @param [out] gridDim grid dimension specified as multiple of blockDim. - * @param [out] blockDim block dimensions specified in work-items - * @param [out] sharedMem Amount of dynamic shared memory to allocate for this kernel. The - * HIP-Clang compiler provides support for extern shared declarations. - * @param [out] stream Stream where the kernel should be dispatched. May be 0, in which case the - * default stream is used with associated synchronization rules. - * - * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue - * - */ -hipError_t __hipPopCallConfiguration(dim3 *gridDim, - dim3 *blockDim, - size_t *sharedMem, - hipStream_t *stream); -/** - * @brief C compliant kernel launch API - * - * @param [in] function_address - kernel stub function pointer. 
-/** - * Copies memory for 2D arrays. - * - * @param pCopy - Parameters for the memory copy - * - * @returns #hipSuccess, #hipErrorInvalidValue - */ -hipError_t hipDrvMemcpy2DUnaligned(const hip_Memcpy2D* pCopy); -//TODO: Move this to hip_ext.h -hipError_t hipExtLaunchKernel(const void* function_address, dim3 numBlocks, dim3 dimBlocks, - void** args, size_t sharedMemBytes, hipStream_t stream, - hipEvent_t startEvent, hipEvent_t stopEvent, int flags); -// doxygen end Clang launch -/** - * @} - */ -/** - *------------------------------------------------------------------------------------------------- - *------------------------------------------------------------------------------------------------- - * @defgroup Texture Texture Management - * @{ - * This section describes the texture management functions of HIP runtime API. - */ -/** - * - * @addtogroup TextureD Texture Management [Deprecated] - * @{ - * @ingroup Texture - * This section describes the deprecated texture management functions of HIP runtime API. - */ -DEPRECATED(DEPRECATED_MSG) -hipError_t hipBindTexture( - size_t* offset, - const textureReference* tex, - const void* devPtr, - const hipChannelFormatDesc* desc, - size_t size __dparm(UINT_MAX)); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipBindTexture2D( - size_t* offset, - const textureReference* tex, - const void* devPtr, - const hipChannelFormatDesc* desc, - size_t width, - size_t height, - size_t pitch); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipBindTextureToArray( - const textureReference* tex, - hipArray_const_t array, - const hipChannelFormatDesc* desc); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipGetTextureAlignmentOffset( - size_t* offset, - const textureReference* texref); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipUnbindTexture(const textureReference* tex); -// doxygen end deprecated texture management -/** - * @} - */ -hipError_t hipBindTextureToMipmappedArray( - const textureReference* tex, - hipMipmappedArray_const_t mipmappedArray, - const hipChannelFormatDesc* desc); -hipError_t hipGetTextureReference( - const textureReference** texref, - const void* symbol); -hipError_t hipCreateTextureObject( - hipTextureObject_t* pTexObject, - const hipResourceDesc* pResDesc, - const hipTextureDesc* pTexDesc, - const struct hipResourceViewDesc* pResViewDesc); -hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject); -hipError_t hipGetChannelDesc( - hipChannelFormatDesc* desc, - hipArray_const_t array); -hipError_t hipGetTextureObjectResourceDesc( - hipResourceDesc* pResDesc, - hipTextureObject_t textureObject); -hipError_t hipGetTextureObjectResourceViewDesc( - struct hipResourceViewDesc* pResViewDesc, - hipTextureObject_t textureObject); -hipError_t hipGetTextureObjectTextureDesc( - hipTextureDesc* pTexDesc, - hipTextureObject_t textureObject);
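For orientation, the texture-object path that supersedes the deprecated texture-reference API above is used roughly as in the sketch below; the buffer devPtr and the length n are illustrative placeholders:

    // Create a texture object over a linear device buffer of n floats.
    hipTextureObject_t makeTexObj(float* devPtr, size_t n) {
        hipResourceDesc resDesc{};                       // zero-initialized
        resDesc.resType = hipResourceTypeLinear;
        resDesc.res.linear.devPtr = devPtr;
        resDesc.res.linear.desc = hipCreateChannelDesc<float>();
        resDesc.res.linear.sizeInBytes = n * sizeof(float);

        hipTextureDesc texDesc{};
        texDesc.addressMode[0] = hipAddressModeClamp;    // clamp out-of-range reads
        texDesc.filterMode = hipFilterModePoint;         // no interpolation
        texDesc.readMode = hipReadModeElementType;       // return raw elements

        hipTextureObject_t texObj = 0;
        hipCreateTextureObject(&texObj, &resDesc, &texDesc, nullptr);
        return texObj;  // release later with hipDestroyTextureObject(texObj)
    }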
-DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefGetAddress( - hipDeviceptr_t* dev_ptr, - const textureReference* texRef); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefGetAddressMode( - enum hipTextureAddressMode* pam, - const textureReference* texRef, - int dim); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefGetFilterMode( - enum hipTextureFilterMode* pfm, - const textureReference* texRef); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefGetFlags( - unsigned int* pFlags, - const textureReference* texRef); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefGetFormat( - hipArray_Format* pFormat, - int* pNumChannels, - const textureReference* texRef); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefGetMaxAnisotropy( - int* pmaxAnsio, - const textureReference* texRef); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefGetMipmapFilterMode( - enum hipTextureFilterMode* pfm, - const textureReference* texRef); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefGetMipmapLevelBias( - float* pbias, - const textureReference* texRef); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefGetMipmapLevelClamp( - float* pminMipmapLevelClamp, - float* pmaxMipmapLevelClamp, - const textureReference* texRef); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefGetMipMappedArray( - hipMipmappedArray_t* pArray, - const textureReference* texRef); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefSetAddress( - size_t* ByteOffset, - textureReference* texRef, - hipDeviceptr_t dptr, - size_t bytes); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefSetAddress2D( - textureReference* texRef, - const HIP_ARRAY_DESCRIPTOR* desc, - hipDeviceptr_t dptr, - size_t Pitch); -hipError_t hipTexRefSetAddressMode( - textureReference* texRef, - int dim, - enum hipTextureAddressMode am); -hipError_t hipTexRefSetArray( - textureReference* tex, - hipArray_const_t array, - unsigned int flags); -hipError_t hipTexRefSetFilterMode( - textureReference* texRef, - enum hipTextureFilterMode fm); -hipError_t hipTexRefSetFlags( - textureReference* texRef, - unsigned int Flags); -hipError_t hipTexRefSetFormat( - textureReference* texRef, - hipArray_Format fmt, - int NumPackedComponents); -DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefSetMaxAnisotropy( - textureReference* texRef, - unsigned int maxAniso); -hipError_t hipTexObjectCreate( - hipTextureObject_t* pTexObject, - const HIP_RESOURCE_DESC* pResDesc, - const HIP_TEXTURE_DESC* pTexDesc, - const HIP_RESOURCE_VIEW_DESC* pResViewDesc); -hipError_t hipTexObjectDestroy( - hipTextureObject_t texObject); -hipError_t hipTexObjectGetResourceDesc( - HIP_RESOURCE_DESC* pResDesc, - hipTextureObject_t texObject); -hipError_t hipTexObjectGetResourceViewDesc( - HIP_RESOURCE_VIEW_DESC* pResViewDesc, - hipTextureObject_t texObject); -hipError_t hipTexObjectGetTextureDesc( - HIP_TEXTURE_DESC* pTexDesc, - hipTextureObject_t texObject); -// doxygen end Texture management -/** - * @} - */ -// The following are not supported. 
-DEPRECATED(DEPRECATED_MSG) -hipError_t hipTexRefSetBorderColor( - textureReference* texRef, - float* pBorderColor); -hipError_t hipTexRefSetMipmapFilterMode( - textureReference* texRef, - enum hipTextureFilterMode fm); -hipError_t hipTexRefSetMipmapLevelBias( - textureReference* texRef, - float bias); -hipError_t hipTexRefSetMipmapLevelClamp( - textureReference* texRef, - float minMipMapLevelClamp, - float maxMipMapLevelClamp); -hipError_t hipTexRefSetMipmappedArray( - textureReference* texRef, - struct hipMipmappedArray* mipmappedArray, - unsigned int Flags); -hipError_t hipMipmappedArrayCreate( - hipMipmappedArray_t* pHandle, - HIP_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc, - unsigned int numMipmapLevels); -hipError_t hipMipmappedArrayDestroy( - hipMipmappedArray_t hMipmappedArray); -hipError_t hipMipmappedArrayGetLevel( - hipArray_t* pLevelArray, - hipMipmappedArray_t hMipMappedArray, - unsigned int level); -/** - *------------------------------------------------------------------------------------------------- - *------------------------------------------------------------------------------------------------- - * @defgroup Runtime Runtime Compilation - * @{ - * This section describes the runtime compilation functions of HIP runtime API. - * - */ -// This group is for HIPrtc - -// doxygen end Runtime -/** - * @} - */ - -/** - * Callback/Activity API - */ -hipError_t hipRegisterApiCallback(uint32_t id, void* fun, void* arg); -hipError_t hipRemoveApiCallback(uint32_t id); -hipError_t hipRegisterActivityCallback(uint32_t id, void* fun, void* arg); -hipError_t hipRemoveActivityCallback(uint32_t id); -const char* hipApiName(uint32_t id); -const char* hipKernelNameRef(const hipFunction_t f); -const char* hipKernelNameRefByPtr(const void* hostFunction, hipStream_t stream); -int hipGetStreamDeviceId(hipStream_t stream); -#ifdef __cplusplus -/** - * An opaque value that represents a hip graph - */ -class hipGraph; -typedef hipGraph* hipGraph_t; -/** - * An opaque value that represents a hip graph node - */ -class hipGraphNode; -typedef hipGraphNode* hipGraphNode_t; -/** - * An opaque value that represents a hip graph Exec - */ -class hipGraphExec; -typedef hipGraphExec* hipGraphExec_t; -typedef enum hipGraphNodeType { - hipGraphNodeTypeKernel = 1, ///< GPU kernel node - hipGraphNodeTypeMemcpy = 2, ///< Memcpy 3D node - hipGraphNodeTypeMemset = 3, ///< Memset 1D node - hipGraphNodeTypeHost = 4, ///< Host (executable) node - hipGraphNodeTypeGraph = 5, ///< Node which executes an embedded graph - hipGraphNodeTypeEmpty = 6, ///< Empty (no-op) node - hipGraphNodeTypeWaitEvent = 7, ///< External event wait node - hipGraphNodeTypeEventRecord = 8, ///< External event record node - hipGraphNodeTypeMemcpy1D = 9, ///< Memcpy 1D node - hipGraphNodeTypeMemcpyFromSymbol = 10, ///< MemcpyFromSymbol node - hipGraphNodeTypeMemcpyToSymbol = 11, ///< MemcpyToSymbol node - hipGraphNodeTypeCount -} hipGraphNodeType; -typedef void (*hipHostFn_t)(void* userData); -typedef struct hipHostNodeParams { - hipHostFn_t fn; - void* userData; -} hipHostNodeParams; -typedef struct hipKernelNodeParams { - dim3 blockDim; - void** extra; - void* func; - dim3 gridDim; - void** kernelParams; - unsigned int sharedMemBytes; -} hipKernelNodeParams; -typedef struct hipMemsetParams { - void* dst; - unsigned int elementSize; - size_t height; - size_t pitch; - unsigned int value; - size_t width; -} hipMemsetParams; -enum hipGraphExecUpdateResult { - hipGraphExecUpdateSuccess = 0x0, ///< The update succeeded - hipGraphExecUpdateError = 0x1, 
///< The update failed for an unexpected reason which is described - ///< in the return value of the function - hipGraphExecUpdateErrorTopologyChanged = 0x2, ///< The update failed because the topology changed - hipGraphExecUpdateErrorNodeTypeChanged = 0x3, ///< The update failed because a node type changed - hipGraphExecUpdateErrorFunctionChanged = - 0x4, ///< The update failed because the function of a kernel node changed - hipGraphExecUpdateErrorParametersChanged = - 0x5, ///< The update failed because the parameters changed in a way that is not supported - hipGraphExecUpdateErrorNotSupported = - 0x6, ///< The update failed because something about the node is not supported - hipGraphExecUpdateErrorUnsupportedFunctionChange = 0x7 -}; -enum hipStreamCaptureMode { - hipStreamCaptureModeGlobal = 0, - hipStreamCaptureModeThreadLocal, - hipStreamCaptureModeRelaxed -}; -enum hipStreamCaptureStatus { - hipStreamCaptureStatusNone = 0, ///< Stream is not capturing - hipStreamCaptureStatusActive, ///< Stream is actively capturing - hipStreamCaptureStatusInvalidated ///< Stream is part of a capture sequence that has been - ///< invalidated, but not terminated -}; -hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode); -hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph); -// Creates a graph. -hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags); -// Destroys a graph. -hipError_t hipGraphDestroy(hipGraph_t graph); -// Destroys an executable graph. -hipError_t hipGraphExecDestroy(hipGraphExec_t pGraphExec); -// Creates an executable graph from a graph. -hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph, - hipGraphNode_t* pErrorNode, char* pLogBuffer, size_t bufferSize); -// Launches an executable graph in a stream. -hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream); -// Creates a kernel execution node and adds it to a graph. -hipError_t hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, - const hipGraphNode_t* pDependencies, size_t numDependencies, - const hipKernelNodeParams* pNodeParams); -// Creates a memcpy node and adds it to a graph. -hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, - const hipGraphNode_t* pDependencies, size_t numDependencies, - const hipMemcpy3DParms* pCopyParams); -// Creates a memset node and adds it to a graph. 
-hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, - const hipGraphNode_t* pDependencies, size_t numDependencies, - const hipMemsetParams* pMemsetParams); -#endif -// doxygen end graph API -/** - * @} - */ -#ifdef __cplusplus -} /* extern "c" */ -#endif -#if USE_PROF_API -#include -#endif -#ifdef __cplusplus -#if defined(__clang__) && defined(__HIP__) -template <typename T> -static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, - T f, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) { - return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, reinterpret_cast<const void*>(f),dynSharedMemPerBlk,blockSizeLimit); -} -template <typename T> -static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, - T f, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, unsigned int flags = 0 ) { - return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, reinterpret_cast<const void*>(f),dynSharedMemPerBlk,blockSizeLimit); -} -#endif // defined(__clang__) && defined(__HIP__) -template <typename T> -hipError_t hipGetSymbolAddress(void** devPtr, const T &symbol) { - return ::hipGetSymbolAddress(devPtr, (const void *)&symbol); -} -template <typename T> -hipError_t hipGetSymbolSize(size_t* size, const T &symbol) { - return ::hipGetSymbolSize(size, (const void *)&symbol); -} -template <typename T> -hipError_t hipMemcpyToSymbol(const T& symbol, const void* src, size_t sizeBytes, - size_t offset __dparm(0), - hipMemcpyKind kind __dparm(hipMemcpyHostToDevice)) { - return ::hipMemcpyToSymbol((const void*)&symbol, src, sizeBytes, offset, kind); -} -template <typename T> -hipError_t hipMemcpyToSymbolAsync(const T& symbol, const void* src, size_t sizeBytes, size_t offset, - hipMemcpyKind kind, hipStream_t stream __dparm(0)) { - return ::hipMemcpyToSymbolAsync((const void*)&symbol, src, sizeBytes, offset, kind, stream); -} -template <typename T> -hipError_t hipMemcpyFromSymbol(void* dst, const T &symbol, - size_t sizeBytes, size_t offset __dparm(0), - hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) { - return ::hipMemcpyFromSymbol(dst, (const void*)&symbol, sizeBytes, offset, kind); -} -template <typename T> -hipError_t hipMemcpyFromSymbolAsync(void* dst, const T& symbol, size_t sizeBytes, size_t offset, - hipMemcpyKind kind, hipStream_t stream __dparm(0)) { - return ::hipMemcpyFromSymbolAsync(dst, (const void*)&symbol, sizeBytes, offset, kind, stream); -} -template <typename T> -inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) { - return hipOccupancyMaxActiveBlocksPerMultiprocessor( - numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk); -} -template <typename T> -inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { - return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk, flags); -} -template <typename F> -inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, - F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) { -return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize,(hipFunction_t)kernel, dynSharedMemPerBlk, blockSizeLimit); -} -template <class T> -inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, - void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) { - return hipLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim, - blockDim, kernelParams, sharedMemBytes, stream); -} -template <class T> -inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, - unsigned int numDevices, unsigned int flags = 0) { - return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags); -} -template <class T> -inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, - unsigned int numDevices, unsigned int flags = 0) { - return hipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags); -} -hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipResourceDesc* pResDesc); -hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject); -template <class T, int dim, enum hipTextureReadMode readMode> -DEPRECATED(DEPRECATED_MSG) -static inline hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex, - const void* devPtr, size_t size = UINT_MAX) { - return hipBindTexture(offset, &tex, devPtr, &tex.channelDesc, size); -} -template <class T, int dim, enum hipTextureReadMode readMode> -DEPRECATED(DEPRECATED_MSG) -static inline hipError_t - hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex, const void* devPtr, - const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) { - return hipBindTexture(offset, &tex, devPtr, &desc, size); -} -template <class T, int dim, enum hipTextureReadMode readMode> -DEPRECATED(DEPRECATED_MSG) -static inline hipError_t hipBindTexture2D( - size_t *offset, - const struct texture<T, dim, readMode> &tex, - const void *devPtr, - size_t width, - size_t height, - size_t pitch) -{ - return hipBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch); -} -template <class T, int dim, enum hipTextureReadMode readMode> -DEPRECATED(DEPRECATED_MSG) -static inline hipError_t hipBindTexture2D( - size_t *offset, - const struct texture<T, dim, readMode> &tex, - const void *devPtr, - const struct hipChannelFormatDesc &desc, - size_t width, - size_t height, - size_t pitch) -{ - return hipBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch); -} -template <class T, int dim, enum hipTextureReadMode readMode> -DEPRECATED(DEPRECATED_MSG) -static inline hipError_t hipBindTextureToArray( - const struct texture<T, dim, readMode> &tex, - hipArray_const_t array) -{ - struct hipChannelFormatDesc desc; - hipError_t err = hipGetChannelDesc(&desc, array); - return (err == hipSuccess) ? hipBindTextureToArray(&tex, array, &desc) : err; -} -template <class T, int dim, enum hipTextureReadMode readMode> -DEPRECATED(DEPRECATED_MSG) -static inline hipError_t hipBindTextureToArray( - const struct texture<T, dim, readMode> &tex, - hipArray_const_t array, - const struct hipChannelFormatDesc &desc) -{ - return hipBindTextureToArray(&tex, array, &desc); -} -template <class T, int dim, enum hipTextureReadMode readMode> -static inline hipError_t hipBindTextureToMipmappedArray( - const struct texture<T, dim, readMode> &tex, - hipMipmappedArray_const_t mipmappedArray) -{ - struct hipChannelFormatDesc desc; - hipArray_t levelArray; - hipError_t err = hipGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0); - if (err != hipSuccess) { - return err; - } - err = hipGetChannelDesc(&desc, levelArray); - return (err == hipSuccess) ? hipBindTextureToMipmappedArray(&tex, mipmappedArray, &desc) : err; -} -template <class T, int dim, enum hipTextureReadMode readMode> -static inline hipError_t hipBindTextureToMipmappedArray( - const struct texture<T, dim, readMode> &tex, - hipMipmappedArray_const_t mipmappedArray, - const struct hipChannelFormatDesc &desc) -{ - return hipBindTextureToMipmappedArray(&tex, mipmappedArray, &desc); -} -template <class T, int dim, enum hipTextureReadMode readMode> -DEPRECATED(DEPRECATED_MSG) -static inline hipError_t hipUnbindTexture( - const struct texture<T, dim, readMode> &tex) -{ - return hipUnbindTexture(&tex); -} -#endif // __cplusplus -#ifdef __GNUC__ -#pragma GCC visibility pop -#endif -// doxygen end HIP API -/** - * @} - */ - +#include "hip/amd_detail/hip_runtime_api.h" #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include "hip/nvidia_detail/nvidia_hip_runtime_api.h" +#include "hip/nvidia_detail/hip_runtime_api.h" #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif @@ -4382,4 +495,5 @@ static inline hipError_t hipMallocManaged(T** devPtr, size_t size, return hipMallocManaged((void**)devPtr, size, flags); } #endif + #endif diff --git a/include/hip/hip_texture_types.h b/include/hip/hip_texture_types.h index af9ec90eee..308da167a0 100644 --- a/include/hip/hip_texture_types.h +++ b/include/hip/hip_texture_types.h @@ -25,9 +25,9 @@ THE SOFTWARE. #define HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif diff --git a/include/hip/hip_vector_types.h b/include/hip/hip_vector_types.h index aba545e5d4..9aa27dbbf1 100644 --- a/include/hip/hip_vector_types.h +++ b/include/hip/hip_vector_types.h @@ -30,7 +30,7 @@ THE SOFTWARE. #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) #if __cplusplus -#include +#include #endif #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) #include diff --git a/include/hip/hiprtc.h b/include/hip/hiprtc.h index 471dd3714b..6dd11de600 100644 --- a/include/hip/hiprtc.h +++ b/include/hip/hiprtc.h @@ -24,9 +24,9 @@ THE SOFTWARE. #include #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) - #include + #include #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) - #include + #include #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif diff --git a/include/hip/library_types.h b/include/hip/library_types.h index a7f5177f5d..805a385644 100644 --- a/include/hip/library_types.h +++ b/include/hip/library_types.h @@ -26,7 +26,7 @@ THE SOFTWARE.
#include #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) #include "library_types.h" #else diff --git a/include/hip/math_functions.h b/include/hip/math_functions.h index efb704f876..7488052e73 100644 --- a/include/hip/math_functions.h +++ b/include/hip/math_functions.h @@ -30,7 +30,7 @@ THE SOFTWARE. #include #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) -#include +#include #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__)) //#include #else diff --git a/include/hip/nvidia_detail/channel_descriptor.h b/include/hip/nvidia_detail/channel_descriptor.h new file mode 100644 index 0000000000..7eb0e65fda --- /dev/null +++ b/include/hip/nvidia_detail/channel_descriptor.h @@ -0,0 +1,28 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H +#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H + +#include "channel_descriptor.h" + +#endif diff --git a/include/hip/nvidia_detail/hip_complex.h b/include/hip/nvidia_detail/hip_complex.h new file mode 100644 index 0000000000..10a53d1743 --- /dev/null +++ b/include/hip/nvidia_detail/hip_complex.h @@ -0,0 +1,119 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H +#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H + +#include "cuComplex.h" + +typedef cuFloatComplex hipFloatComplex; + +__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return cuCrealf(z); } + +__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return cuCimagf(z); } + +__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) { + return make_cuFloatComplex(a, b); +} + +__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) { return cuConjf(z); } + +__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) { + return cuCabsf(z) * cuCabsf(z); +} + +__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) { + return cuCaddf(p, q); +} + +__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) { + return cuCsubf(p, q); +} + +__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) { + return cuCmulf(p, q); +} + +__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) { + return cuCdivf(p, q); +} + +__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return cuCabsf(z); } + +typedef cuDoubleComplex hipDoubleComplex; + +__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return cuCreal(z); } + +__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return cuCimag(z); } + +__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) { + return make_cuDoubleComplex(a, b); +} + +__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { return cuConj(z); } + +__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) { + return cuCabs(z) * cuCabs(z); +} + +__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) { + return cuCadd(p, q); +} + +__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) { + return cuCsub(p, q); +} + +__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) { + return cuCmul(p, q); +} + +__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) { + return cuCdiv(p, q); +} + +__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return cuCabs(z); } + +typedef cuFloatComplex hipComplex; + +__device__ __host__ static inline hipComplex make_Complex(float x, float y) { + return make_cuComplex(x, y); +} + +__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) { + return cuComplexDoubleToFloat(z); +} + +__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) { + return cuComplexFloatToDouble(z); +} + +__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) { + return cuCfmaf(p, q, r); +} + +__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, + hipDoubleComplex r) { + return cuCfma(p, q, r); +} + +#endif diff --git 
a/include/hip/nvidia_detail/hip_cooperative_groups.h b/include/hip/nvidia_detail/hip_cooperative_groups.h new file mode 100644 index 0000000000..fc98ae2281 --- /dev/null +++ b/include/hip/nvidia_detail/hip_cooperative_groups.h @@ -0,0 +1,12 @@ +#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H +#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H + +// Include CUDA headers +#include +#include + +// Include HIP wrapper headers around CUDA +#include +#include + +#endif // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H diff --git a/include/hip/nvidia_detail/hip_runtime.h b/include/hip/nvidia_detail/hip_runtime.h new file mode 100644 index 0000000000..a42fecc611 --- /dev/null +++ b/include/hip/nvidia_detail/hip_runtime.h @@ -0,0 +1,122 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H +#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H + +#include + +#include + +#define HIP_KERNEL_NAME(...) __VA_ARGS__ + +typedef int hipLaunchParm; + +#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ + do { \ + kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__); \ + } while (0) + +#define hipLaunchKernelGGL(kernelName, ...)
hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__) + +#define hipReadModeElementType cudaReadModeElementType + +#ifdef __CUDA_ARCH__ + + +// 32-bit Atomics: +#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (__CUDA_ARCH__ >= 110) +#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 110) +#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (__CUDA_ARCH__ >= 120) +#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 120) +#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (__CUDA_ARCH__ >= 200) + +// 64-bit Atomics: +#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (__CUDA_ARCH__ >= 200) +#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (__CUDA_ARCH__ >= 120) + +// Doubles +#define __HIP_ARCH_HAS_DOUBLES__ (__CUDA_ARCH__ >= 120) + +// warp cross-lane operations: +#define __HIP_ARCH_HAS_WARP_VOTE__ (__CUDA_ARCH__ >= 120) +#define __HIP_ARCH_HAS_WARP_BALLOT__ (__CUDA_ARCH__ >= 200) +#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (__CUDA_ARCH__ >= 300) +#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (__CUDA_ARCH__ >= 350) + +// sync +#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (__CUDA_ARCH__ >= 200) +#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (__CUDA_ARCH__ >= 200) + +// misc +#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (__CUDA_ARCH__ >= 200) +#define __HIP_ARCH_HAS_3DGRID__ (__CUDA_ARCH__ >= 200) +#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (__CUDA_ARCH__ >= 350) + +#endif + +#ifdef __CUDACC__ + + +#define hipThreadIdx_x threadIdx.x +#define hipThreadIdx_y threadIdx.y +#define hipThreadIdx_z threadIdx.z + +#define hipBlockIdx_x blockIdx.x +#define hipBlockIdx_y blockIdx.y +#define hipBlockIdx_z blockIdx.z + +#define hipBlockDim_x blockDim.x +#define hipBlockDim_y blockDim.y +#define hipBlockDim_z blockDim.z + +#define hipGridDim_x gridDim.x +#define hipGridDim_y gridDim.y +#define hipGridDim_z gridDim.z + +#define HIP_SYMBOL(X) &X + +/** + * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications + * To be removed in a future release. + */ +#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[]; +#define HIP_DYNAMIC_SHARED_ATTRIBUTE + +#ifdef __HIP_DEVICE_COMPILE__ +#define abort_() \ + { asm("trap;"); } +#undef assert +#define assert(COND) \ + { \ + if (!COND) { \ + abort_(); \ + } \ + } +#endif + +#define __clock() clock() +#define __clock64() clock64() + +#endif + +#endif diff --git a/include/hip/nvidia_detail/hip_runtime_api.h b/include/hip/nvidia_detail/hip_runtime_api.h new file mode 100644 index 0000000000..66e4743abd --- /dev/null +++ b/include/hip/nvidia_detail/hip_runtime_api.h @@ -0,0 +1,2195 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H +#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +#define __dparm(x) = x +#else +#define __dparm(x) +#endif + +// Add Deprecated Support for CUDA Mapped HIP APIs +#if defined(__DOXYGEN_ONLY__) || defined(HIP_ENABLE_DEPRECATED) +#define __HIP_DEPRECATED +#elif defined(_MSC_VER) +#define __HIP_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __HIP_DEPRECATED __attribute__((deprecated)) +#else +#define __HIP_DEPRECATED +#endif + + +// TODO -move to include/hip_runtime_api.h as a common implementation. +/** + * Memory copy types + * + */ +typedef enum hipMemcpyKind { + hipMemcpyHostToHost, + hipMemcpyHostToDevice, + hipMemcpyDeviceToHost, + hipMemcpyDeviceToDevice, + hipMemcpyDefault +} hipMemcpyKind; + +typedef enum hipMemoryAdvise { + hipMemAdviseSetReadMostly, + hipMemAdviseUnsetReadMostly, + hipMemAdviseSetPreferredLocation, + hipMemAdviseUnsetPreferredLocation, + hipMemAdviseSetAccessedBy, + hipMemAdviseUnsetAccessedBy +} hipMemoryAdvise; + +// hipDataType +#define hipDataType cudaDataType +#define HIP_R_16F CUDA_R_16F +#define HIP_R_32F CUDA_R_32F +#define HIP_R_64F CUDA_R_64F +#define HIP_C_16F CUDA_C_16F +#define HIP_C_32F CUDA_C_32F +#define HIP_C_64F CUDA_C_64F + +// hipLibraryPropertyType +#define hipLibraryPropertyType libraryPropertyType +#define HIP_LIBRARY_MAJOR_VERSION MAJOR_VERSION +#define HIP_LIBRARY_MINOR_VERSION MINOR_VERSION +#define HIP_LIBRARY_PATCH_LEVEL PATCH_LEVEL + +#define HIP_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR +#define HIP_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR + +//hipArray_Format +#define HIP_AD_FORMAT_UNSIGNED_INT8 CU_AD_FORMAT_UNSIGNED_INT8 +#define HIP_AD_FORMAT_UNSIGNED_INT16 CU_AD_FORMAT_UNSIGNED_INT16 +#define HIP_AD_FORMAT_UNSIGNED_INT32 CU_AD_FORMAT_UNSIGNED_INT32 +#define HIP_AD_FORMAT_SIGNED_INT8 CU_AD_FORMAT_SIGNED_INT8 +#define HIP_AD_FORMAT_SIGNED_INT16 CU_AD_FORMAT_SIGNED_INT16 +#define HIP_AD_FORMAT_SIGNED_INT32 CU_AD_FORMAT_SIGNED_INT32 +#define HIP_AD_FORMAT_HALF CU_AD_FORMAT_HALF +#define HIP_AD_FORMAT_FLOAT CU_AD_FORMAT_FLOAT + +// hipArray_Format +#define hipArray_Format CUarray_format + +inline static CUarray_format hipArray_FormatToCUarray_format( + hipArray_Format format) { + switch (format) { + case HIP_AD_FORMAT_UNSIGNED_INT8: + return CU_AD_FORMAT_UNSIGNED_INT8; + case HIP_AD_FORMAT_UNSIGNED_INT16: + return CU_AD_FORMAT_UNSIGNED_INT16; + case HIP_AD_FORMAT_UNSIGNED_INT32: + return CU_AD_FORMAT_UNSIGNED_INT32; + case HIP_AD_FORMAT_SIGNED_INT8: + return CU_AD_FORMAT_SIGNED_INT8; + case HIP_AD_FORMAT_SIGNED_INT16: + return CU_AD_FORMAT_SIGNED_INT16; + case HIP_AD_FORMAT_SIGNED_INT32: + return CU_AD_FORMAT_SIGNED_INT32; + case HIP_AD_FORMAT_HALF: + return CU_AD_FORMAT_HALF; + case HIP_AD_FORMAT_FLOAT: + return CU_AD_FORMAT_FLOAT; + default: + return CU_AD_FORMAT_UNSIGNED_INT8; + } +} + +#define HIP_TR_ADDRESS_MODE_WRAP CU_TR_ADDRESS_MODE_WRAP +#define HIP_TR_ADDRESS_MODE_CLAMP CU_TR_ADDRESS_MODE_CLAMP +#define HIP_TR_ADDRESS_MODE_MIRROR CU_TR_ADDRESS_MODE_MIRROR +#define HIP_TR_ADDRESS_MODE_BORDER CU_TR_ADDRESS_MODE_BORDER + +// hipAddress_mode +#define hipAddress_mode 
CUaddress_mode + +inline static CUaddress_mode hipAddress_modeToCUaddress_mode( + hipAddress_mode mode) { + switch (mode) { + case HIP_TR_ADDRESS_MODE_WRAP: + return CU_TR_ADDRESS_MODE_WRAP; + case HIP_TR_ADDRESS_MODE_CLAMP: + return CU_TR_ADDRESS_MODE_CLAMP; + case HIP_TR_ADDRESS_MODE_MIRROR: + return CU_TR_ADDRESS_MODE_MIRROR; + case HIP_TR_ADDRESS_MODE_BORDER: + return CU_TR_ADDRESS_MODE_BORDER; + default: + return CU_TR_ADDRESS_MODE_WRAP; + } +} + +#define HIP_TR_FILTER_MODE_POINT CU_TR_FILTER_MODE_POINT +#define HIP_TR_FILTER_MODE_LINEAR CU_TR_FILTER_MODE_LINEAR + +// hipFilter_mode +#define hipFilter_mode CUfilter_mode + +inline static CUfilter_mode hipFilter_mode_enumToCUfilter_mode( + hipFilter_mode mode) { + switch (mode) { + case HIP_TR_FILTER_MODE_POINT: + return CU_TR_FILTER_MODE_POINT; + case HIP_TR_FILTER_MODE_LINEAR: + return CU_TR_FILTER_MODE_LINEAR; + default: + return CU_TR_FILTER_MODE_POINT; + } +} + +//hipResourcetype +#define HIP_RESOURCE_TYPE_ARRAY CU_RESOURCE_TYPE_ARRAY +#define HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY CU_RESOURCE_TYPE_MIPMAPPED_ARRAY +#define HIP_RESOURCE_TYPE_LINEAR CU_RESOURCE_TYPE_LINEAR +#define HIP_RESOURCE_TYPE_PITCH2D CU_RESOURCE_TYPE_PITCH2D + +// hipResourcetype +#define hipResourcetype CUresourcetype + +inline static CUresourcetype hipResourcetype_enumToCUresourcetype( + hipResourcetype resType) { + switch (resType) { + case HIP_RESOURCE_TYPE_ARRAY: + return CU_RESOURCE_TYPE_ARRAY; + case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY: + return CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; + case HIP_RESOURCE_TYPE_LINEAR: + return CU_RESOURCE_TYPE_LINEAR; + case HIP_RESOURCE_TYPE_PITCH2D: + return CU_RESOURCE_TYPE_PITCH2D; + default: + return CU_RESOURCE_TYPE_ARRAY; + } +} + +#define hipTexRef CUtexref +#define hiparray CUarray + +// hipTextureAddressMode +typedef enum cudaTextureAddressMode hipTextureAddressMode; +#define hipAddressModeWrap cudaAddressModeWrap +#define hipAddressModeClamp cudaAddressModeClamp +#define hipAddressModeMirror cudaAddressModeMirror +#define hipAddressModeBorder cudaAddressModeBorder + +// hipTextureFilterMode +typedef enum cudaTextureFilterMode hipTextureFilterMode; +#define hipFilterModePoint cudaFilterModePoint +#define hipFilterModeLinear cudaFilterModeLinear + +// hipTextureReadMode +typedef enum cudaTextureReadMode hipTextureReadMode; +#define hipReadModeElementType cudaReadModeElementType +#define hipReadModeNormalizedFloat cudaReadModeNormalizedFloat + +// hipChannelFormatKind +typedef enum cudaChannelFormatKind hipChannelFormatKind; +#define hipChannelFormatKindSigned cudaChannelFormatKindSigned +#define hipChannelFormatKindUnsigned cudaChannelFormatKindUnsigned +#define hipChannelFormatKindFloat cudaChannelFormatKindFloat +#define hipChannelFormatKindNone cudaChannelFormatKindNone + +// hipMemRangeAttribute +typedef enum cudaMemRangeAttribute hipMemRangeAttribute; +#define hipMemRangeAttributeReadMostly cudaMemRangeAttributeReadMostly +#define hipMemRangeAttributePreferredLocation cudaMemRangeAttributePreferredLocation +#define hipMemRangeAttributeAccessedBy cudaMemRangeAttributeAccessedBy +#define hipMemRangeAttributeLastPrefetchLocation cudaMemRangeAttributeLastPrefetchLocation + +#define hipSurfaceBoundaryMode cudaSurfaceBoundaryMode +#define hipBoundaryModeZero cudaBoundaryModeZero +#define hipBoundaryModeTrap cudaBoundaryModeTrap +#define hipBoundaryModeClamp cudaBoundaryModeClamp + +// hipFuncCache +#define hipFuncCachePreferNone cudaFuncCachePreferNone +#define hipFuncCachePreferShared cudaFuncCachePreferShared +#define 
hipFuncCachePreferL1 cudaFuncCachePreferL1 +#define hipFuncCachePreferEqual cudaFuncCachePreferEqual + +// hipResourceType +#define hipResourceType cudaResourceType +#define hipResourceTypeArray cudaResourceTypeArray +#define hipResourceTypeMipmappedArray cudaResourceTypeMipmappedArray +#define hipResourceTypeLinear cudaResourceTypeLinear +#define hipResourceTypePitch2D cudaResourceTypePitch2D +// +// hipErrorNoDevice. + + +//! Flags that can be used with hipEventCreateWithFlags: +#define hipEventDefault cudaEventDefault +#define hipEventBlockingSync cudaEventBlockingSync +#define hipEventDisableTiming cudaEventDisableTiming +#define hipEventInterprocess cudaEventInterprocess +#define hipEventReleaseToDevice 0 /* no-op on CUDA platform */ +#define hipEventReleaseToSystem 0 /* no-op on CUDA platform */ + + +#define hipHostMallocDefault cudaHostAllocDefault +#define hipHostMallocPortable cudaHostAllocPortable +#define hipHostMallocMapped cudaHostAllocMapped +#define hipHostMallocWriteCombined cudaHostAllocWriteCombined +#define hipHostMallocCoherent 0x0 +#define hipHostMallocNonCoherent 0x0 + +#define hipMemAttachGlobal cudaMemAttachGlobal +#define hipMemAttachHost cudaMemAttachHost +#define hipMemAttachSingle cudaMemAttachSingle + +#define hipHostRegisterDefault cudaHostRegisterDefault +#define hipHostRegisterPortable cudaHostRegisterPortable +#define hipHostRegisterMapped cudaHostRegisterMapped +#define hipHostRegisterIoMemory cudaHostRegisterIoMemory + +#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER +#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE +#define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END +#define hipLimitMallocHeapSize cudaLimitMallocHeapSize +#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess + +#define hipOccupancyDefault cudaOccupancyDefault + +#define hipCooperativeLaunchMultiDeviceNoPreSync \ + cudaCooperativeLaunchMultiDeviceNoPreSync +#define hipCooperativeLaunchMultiDeviceNoPostSync \ + cudaCooperativeLaunchMultiDeviceNoPostSync + + +// enum CUjit_option redefines +#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS +#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK +#define hipJitOptionWallTime CU_JIT_WALL_TIME +#define hipJitOptionInfoLogBuffer CU_JIT_INFO_LOG_BUFFER +#define hipJitOptionInfoLogBufferSizeBytes CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES +#define hipJitOptionErrorLogBuffer CU_JIT_ERROR_LOG_BUFFER +#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES +#define hipJitOptionOptimizationLevel CU_JIT_OPTIMIZATION_LEVEL +#define hipJitOptionTargetFromContext CU_JIT_TARGET_FROM_CUCONTEXT +#define hipJitOptionTarget CU_JIT_TARGET +#define hipJitOptionFallbackStrategy CU_JIT_FALLBACK_STRATEGY +#define hipJitOptionGenerateDebugInfo CU_JIT_GENERATE_DEBUG_INFO +#define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE +#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO +#define hipJitOptionCacheMode CU_JIT_CACHE_MODE +#define hipJitOptionSm3xOpt CU_JIT_NEW_SM3X_OPT +#define hipJitOptionFastCompile CU_JIT_FAST_COMPILE +#define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS + +typedef cudaEvent_t hipEvent_t; +typedef cudaStream_t hipStream_t; +typedef cudaIpcEventHandle_t hipIpcEventHandle_t; +typedef cudaIpcMemHandle_t hipIpcMemHandle_t; +typedef enum cudaLimit hipLimit_t; +typedef enum cudaFuncAttribute hipFuncAttribute; +typedef enum cudaFuncCache hipFuncCache_t; +typedef CUcontext hipCtx_t; +typedef enum cudaSharedMemConfig hipSharedMemConfig; +typedef CUfunc_cache hipFuncCache; 
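The hipHostMalloc* aliases above route HIP's pinned-host-memory flags straight onto their cudaHostAlloc counterparts, with the AMD-specific coherence flags collapsing to 0x0 no-ops on the CUDA path. A minimal sketch of what that means for a caller on an NVCC build; the buffer size and error text are illustrative assumptions, not taken from this patch:

    #include <hip/hip_runtime.h>
    #include <cstdio>

    int main() {
        float* pinned = nullptr;
        // hipHostMalloc / hipHostMallocMapped expand, via the aliases above, to
        // cudaHostAlloc(..., cudaHostAllocMapped). hipHostMallocCoherent is 0x0
        // here, so OR-ing it in is harmless when targeting CUDA.
        if (hipHostMalloc((void**)&pinned, 256 * sizeof(float),
                          hipHostMallocMapped | hipHostMallocCoherent) != hipSuccess) {
            std::fprintf(stderr, "pinned allocation failed\n");
            return 1;
        }
        pinned[0] = 42.0f;   // ordinary host-visible memory
        hipHostFree(pinned); // forwards to cudaFreeHost
        return 0;
    }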
+typedef CUjit_option hipJitOption; +typedef CUdevice hipDevice_t; +typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr; +#define hipDevP2PAttrPerformanceRank cudaDevP2PAttrPerformanceRank +#define hipDevP2PAttrAccessSupported cudaDevP2PAttrAccessSupported +#define hipDevP2PAttrNativeAtomicSupported cudaDevP2PAttrNativeAtomicSupported +#define hipDevP2PAttrHipArrayAccessSupported cudaDevP2PAttrCudaArrayAccessSupported +#define hipFuncAttributeMaxDynamicSharedMemorySize cudaFuncAttributeMaxDynamicSharedMemorySize +#define hipFuncAttributePreferredSharedMemoryCarveout cudaFuncAttributePreferredSharedMemoryCarveout + +typedef CUmodule hipModule_t; +typedef CUfunction hipFunction_t; +typedef CUdeviceptr hipDeviceptr_t; +typedef struct cudaArray hipArray; +typedef struct cudaArray* hipArray_t; +typedef struct cudaArray* hipArray_const_t; +typedef struct cudaFuncAttributes hipFuncAttributes; +typedef struct cudaLaunchParams hipLaunchParams; +#define hipFunction_attribute CUfunction_attribute +#define hip_Memcpy2D CUDA_MEMCPY2D +#define HIP_MEMCPY3D CUDA_MEMCPY3D +#define hipMemcpy3DParms cudaMemcpy3DParms +#define hipArrayDefault cudaArrayDefault +#define hipArrayLayered cudaArrayLayered +#define hipArraySurfaceLoadStore cudaArraySurfaceLoadStore +#define hipArrayCubemap cudaArrayCubemap +#define hipArrayTextureGather cudaArrayTextureGather + +typedef cudaTextureObject_t hipTextureObject_t; +typedef cudaSurfaceObject_t hipSurfaceObject_t; +#define hipTextureType1D cudaTextureType1D +#define hipTextureType1DLayered cudaTextureType1DLayered +#define hipTextureType2D cudaTextureType2D +#define hipTextureType2DLayered cudaTextureType2DLayered +#define hipTextureType3D cudaTextureType3D + +#define hipDeviceScheduleAuto cudaDeviceScheduleAuto +#define hipDeviceScheduleSpin cudaDeviceScheduleSpin +#define hipDeviceScheduleYield cudaDeviceScheduleYield +#define hipDeviceScheduleBlockingSync cudaDeviceScheduleBlockingSync +#define hipDeviceScheduleMask cudaDeviceScheduleMask +#define hipDeviceMapHost cudaDeviceMapHost +#define hipDeviceLmemResizeToMax cudaDeviceLmemResizeToMax + +#define hipCpuDeviceId cudaCpuDeviceId +#define hipInvalidDeviceId cudaInvalidDeviceId +typedef struct cudaExtent hipExtent; +typedef struct cudaPitchedPtr hipPitchedPtr; +#define make_hipExtent make_cudaExtent +#define make_hipPos make_cudaPos +#define make_hipPitchedPtr make_cudaPitchedPtr +// Flags that can be used with hipStreamCreateWithFlags +#define hipStreamDefault cudaStreamDefault +#define hipStreamNonBlocking cudaStreamNonBlocking + +typedef struct cudaChannelFormatDesc hipChannelFormatDesc; +typedef struct cudaResourceDesc hipResourceDesc; +typedef struct cudaTextureDesc hipTextureDesc; +typedef struct cudaResourceViewDesc hipResourceViewDesc; +// adding code for hipmemSharedConfig +#define hipSharedMemBankSizeDefault cudaSharedMemBankSizeDefault +#define hipSharedMemBankSizeFourByte cudaSharedMemBankSizeFourByte +#define hipSharedMemBankSizeEightByte cudaSharedMemBankSizeEightByte + +//Function Attributes +#define HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK +#define HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES +#define HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES +#define HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES +#define HIP_FUNC_ATTRIBUTE_NUM_REGS CU_FUNC_ATTRIBUTE_NUM_REGS +#define HIP_FUNC_ATTRIBUTE_PTX_VERSION CU_FUNC_ATTRIBUTE_PTX_VERSION +#define HIP_FUNC_ATTRIBUTE_BINARY_VERSION 
CU_FUNC_ATTRIBUTE_BINARY_VERSION +#define HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA CU_FUNC_ATTRIBUTE_CACHE_MODE_CA +#define HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES +#define HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT +#define HIP_FUNC_ATTRIBUTE_MAX CU_FUNC_ATTRIBUTE_MAX + +#if CUDA_VERSION >= 9000 +#define __shfl(...) __shfl_sync(0xffffffff, __VA_ARGS__) +#define __shfl_up(...) __shfl_up_sync(0xffffffff, __VA_ARGS__) +#define __shfl_down(...) __shfl_down_sync(0xffffffff, __VA_ARGS__) +#define __shfl_xor(...) __shfl_xor_sync(0xffffffff, __VA_ARGS__) +#endif // CUDA_VERSION >= 9000 + +inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) { + switch (cuError) { + case cudaSuccess: + return hipSuccess; + case cudaErrorProfilerDisabled: + return hipErrorProfilerDisabled; + case cudaErrorProfilerNotInitialized: + return hipErrorProfilerNotInitialized; + case cudaErrorProfilerAlreadyStarted: + return hipErrorProfilerAlreadyStarted; + case cudaErrorProfilerAlreadyStopped: + return hipErrorProfilerAlreadyStopped; + case cudaErrorInsufficientDriver: + return hipErrorInsufficientDriver; + case cudaErrorUnsupportedLimit: + return hipErrorUnsupportedLimit; + case cudaErrorPeerAccessUnsupported: + return hipErrorPeerAccessUnsupported; + case cudaErrorInvalidGraphicsContext: + return hipErrorInvalidGraphicsContext; + case cudaErrorSharedObjectSymbolNotFound: + return hipErrorSharedObjectSymbolNotFound; + case cudaErrorSharedObjectInitFailed: + return hipErrorSharedObjectInitFailed; + case cudaErrorOperatingSystem: + return hipErrorOperatingSystem; + case cudaErrorSetOnActiveProcess: + return hipErrorSetOnActiveProcess; + case cudaErrorIllegalAddress: + return hipErrorIllegalAddress; + case cudaErrorInvalidSymbol: + return hipErrorInvalidSymbol; + case cudaErrorMissingConfiguration: + return hipErrorMissingConfiguration; + case cudaErrorMemoryAllocation: + return hipErrorOutOfMemory; + case cudaErrorInitializationError: + return hipErrorNotInitialized; + case cudaErrorLaunchFailure: + return hipErrorLaunchFailure; + case cudaErrorCooperativeLaunchTooLarge: + return hipErrorCooperativeLaunchTooLarge; + case cudaErrorPriorLaunchFailure: + return hipErrorPriorLaunchFailure; + case cudaErrorLaunchOutOfResources: + return hipErrorLaunchOutOfResources; + case cudaErrorInvalidDeviceFunction: + return hipErrorInvalidDeviceFunction; + case cudaErrorInvalidConfiguration: + return hipErrorInvalidConfiguration; + case cudaErrorInvalidDevice: + return hipErrorInvalidDevice; + case cudaErrorInvalidValue: + return hipErrorInvalidValue; + case cudaErrorInvalidDevicePointer: + return hipErrorInvalidDevicePointer; + case cudaErrorInvalidMemcpyDirection: + return hipErrorInvalidMemcpyDirection; + case cudaErrorInvalidResourceHandle: + return hipErrorInvalidHandle; + case cudaErrorNotReady: + return hipErrorNotReady; + case cudaErrorNoDevice: + return hipErrorNoDevice; + case cudaErrorPeerAccessAlreadyEnabled: + return hipErrorPeerAccessAlreadyEnabled; + case cudaErrorPeerAccessNotEnabled: + return hipErrorPeerAccessNotEnabled; + case cudaErrorHostMemoryAlreadyRegistered: + return hipErrorHostMemoryAlreadyRegistered; + case cudaErrorHostMemoryNotRegistered: + return hipErrorHostMemoryNotRegistered; + case cudaErrorMapBufferObjectFailed: + return hipErrorMapFailed; + case cudaErrorAssert: + return hipErrorAssert; + case cudaErrorNotSupported: + return hipErrorNotSupported; + case cudaErrorCudartUnloading: 
+ return hipErrorDeinitialized; + case cudaErrorInvalidKernelImage: + return hipErrorInvalidImage; + case cudaErrorUnmapBufferObjectFailed: + return hipErrorUnmapFailed; + case cudaErrorNoKernelImageForDevice: + return hipErrorNoBinaryForGpu; + case cudaErrorECCUncorrectable: + return hipErrorECCNotCorrectable; + case cudaErrorDeviceAlreadyInUse: + return hipErrorContextAlreadyInUse; + case cudaErrorInvalidPtx: + return hipErrorInvalidKernelFile; + case cudaErrorLaunchTimeout: + return hipErrorLaunchTimeOut; +#if CUDA_VERSION >= 10010 + case cudaErrorInvalidSource: + return hipErrorInvalidSource; + case cudaErrorFileNotFound: + return hipErrorFileNotFound; + case cudaErrorSymbolNotFound: + return hipErrorNotFound; + case cudaErrorArrayIsMapped: + return hipErrorArrayIsMapped; + case cudaErrorNotMappedAsPointer: + return hipErrorNotMappedAsPointer; + case cudaErrorNotMappedAsArray: + return hipErrorNotMappedAsArray; + case cudaErrorNotMapped: + return hipErrorNotMapped; + case cudaErrorAlreadyAcquired: + return hipErrorAlreadyAcquired; + case cudaErrorAlreadyMapped: + return hipErrorAlreadyMapped; +#endif +#if CUDA_VERSION >= 10020 + case cudaErrorDeviceUninitialized: + return hipErrorInvalidContext; +#endif + case cudaErrorUnknown: + default: + return hipErrorUnknown; // Note - translated error. + } +} + +inline static hipError_t hipCUResultTohipError(CUresult cuError) { + switch (cuError) { + case CUDA_SUCCESS: + return hipSuccess; + case CUDA_ERROR_OUT_OF_MEMORY: + return hipErrorOutOfMemory; + case CUDA_ERROR_INVALID_VALUE: + return hipErrorInvalidValue; + case CUDA_ERROR_INVALID_DEVICE: + return hipErrorInvalidDevice; + case CUDA_ERROR_DEINITIALIZED: + return hipErrorDeinitialized; + case CUDA_ERROR_NO_DEVICE: + return hipErrorNoDevice; + case CUDA_ERROR_INVALID_CONTEXT: + return hipErrorInvalidContext; + case CUDA_ERROR_NOT_INITIALIZED: + return hipErrorNotInitialized; + case CUDA_ERROR_INVALID_HANDLE: + return hipErrorInvalidHandle; + case CUDA_ERROR_MAP_FAILED: + return hipErrorMapFailed; + case CUDA_ERROR_PROFILER_DISABLED: + return hipErrorProfilerDisabled; + case CUDA_ERROR_PROFILER_NOT_INITIALIZED: + return hipErrorProfilerNotInitialized; + case CUDA_ERROR_PROFILER_ALREADY_STARTED: + return hipErrorProfilerAlreadyStarted; + case CUDA_ERROR_PROFILER_ALREADY_STOPPED: + return hipErrorProfilerAlreadyStopped; + case CUDA_ERROR_INVALID_IMAGE: + return hipErrorInvalidImage; + case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: + return hipErrorContextAlreadyCurrent; + case CUDA_ERROR_UNMAP_FAILED: + return hipErrorUnmapFailed; + case CUDA_ERROR_ARRAY_IS_MAPPED: + return hipErrorArrayIsMapped; + case CUDA_ERROR_ALREADY_MAPPED: + return hipErrorAlreadyMapped; + case CUDA_ERROR_NO_BINARY_FOR_GPU: + return hipErrorNoBinaryForGpu; + case CUDA_ERROR_ALREADY_ACQUIRED: + return hipErrorAlreadyAcquired; + case CUDA_ERROR_NOT_MAPPED: + return hipErrorNotMapped; + case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: + return hipErrorNotMappedAsArray; + case CUDA_ERROR_NOT_MAPPED_AS_POINTER: + return hipErrorNotMappedAsPointer; + case CUDA_ERROR_ECC_UNCORRECTABLE: + return hipErrorECCNotCorrectable; + case CUDA_ERROR_UNSUPPORTED_LIMIT: + return hipErrorUnsupportedLimit; + case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: + return hipErrorContextAlreadyInUse; + case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: + return hipErrorPeerAccessUnsupported; + case CUDA_ERROR_INVALID_PTX: + return hipErrorInvalidKernelFile; + case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: + return hipErrorInvalidGraphicsContext; + case CUDA_ERROR_INVALID_SOURCE: + return 
hipErrorInvalidSource; + case CUDA_ERROR_FILE_NOT_FOUND: + return hipErrorFileNotFound; + case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: + return hipErrorSharedObjectSymbolNotFound; + case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: + return hipErrorSharedObjectInitFailed; + case CUDA_ERROR_OPERATING_SYSTEM: + return hipErrorOperatingSystem; + case CUDA_ERROR_NOT_FOUND: + return hipErrorNotFound; + case CUDA_ERROR_NOT_READY: + return hipErrorNotReady; + case CUDA_ERROR_ILLEGAL_ADDRESS: + return hipErrorIllegalAddress; + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + return hipErrorLaunchOutOfResources; + case CUDA_ERROR_LAUNCH_TIMEOUT: + return hipErrorLaunchTimeOut; + case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: + return hipErrorPeerAccessAlreadyEnabled; + case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: + return hipErrorPeerAccessNotEnabled; + case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: + return hipErrorSetOnActiveProcess; + case CUDA_ERROR_ASSERT: + return hipErrorAssert; + case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: + return hipErrorHostMemoryAlreadyRegistered; + case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: + return hipErrorHostMemoryNotRegistered; + case CUDA_ERROR_LAUNCH_FAILED: + return hipErrorLaunchFailure; + case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: + return hipErrorCooperativeLaunchTooLarge; + case CUDA_ERROR_NOT_SUPPORTED: + return hipErrorNotSupported; + case CUDA_ERROR_UNKNOWN: + default: + return hipErrorUnknown; // Note - translated error. + } +} + +inline static cudaError_t hipErrorToCudaError(hipError_t hError) { + switch (hError) { + case hipSuccess: + return cudaSuccess; + case hipErrorOutOfMemory: + return cudaErrorMemoryAllocation; + case hipErrorProfilerDisabled: + return cudaErrorProfilerDisabled; + case hipErrorProfilerNotInitialized: + return cudaErrorProfilerNotInitialized; + case hipErrorProfilerAlreadyStarted: + return cudaErrorProfilerAlreadyStarted; + case hipErrorProfilerAlreadyStopped: + return cudaErrorProfilerAlreadyStopped; + case hipErrorInvalidConfiguration: + return cudaErrorInvalidConfiguration; + case hipErrorLaunchOutOfResources: + return cudaErrorLaunchOutOfResources; + case hipErrorInvalidValue: + return cudaErrorInvalidValue; + case hipErrorInvalidHandle: + return cudaErrorInvalidResourceHandle; + case hipErrorInvalidDevice: + return cudaErrorInvalidDevice; + case hipErrorInvalidMemcpyDirection: + return cudaErrorInvalidMemcpyDirection; + case hipErrorInvalidDevicePointer: + return cudaErrorInvalidDevicePointer; + case hipErrorNotInitialized: + return cudaErrorInitializationError; + case hipErrorNoDevice: + return cudaErrorNoDevice; + case hipErrorNotReady: + return cudaErrorNotReady; + case hipErrorPeerAccessNotEnabled: + return cudaErrorPeerAccessNotEnabled; + case hipErrorPeerAccessAlreadyEnabled: + return cudaErrorPeerAccessAlreadyEnabled; + case hipErrorHostMemoryAlreadyRegistered: + return cudaErrorHostMemoryAlreadyRegistered; + case hipErrorHostMemoryNotRegistered: + return cudaErrorHostMemoryNotRegistered; + case hipErrorDeinitialized: + return cudaErrorCudartUnloading; + case hipErrorInvalidSymbol: + return cudaErrorInvalidSymbol; + case hipErrorInsufficientDriver: + return cudaErrorInsufficientDriver; + case hipErrorMissingConfiguration: + return cudaErrorMissingConfiguration; + case hipErrorPriorLaunchFailure: + return cudaErrorPriorLaunchFailure; + case hipErrorInvalidDeviceFunction: + return cudaErrorInvalidDeviceFunction; + case hipErrorInvalidImage: + return cudaErrorInvalidKernelImage; + case hipErrorInvalidContext: +#if CUDA_VERSION >= 10020 
+ return cudaErrorDeviceUninitialized; +#else + return cudaErrorUnknown; +#endif + case hipErrorMapFailed: + return cudaErrorMapBufferObjectFailed; + case hipErrorUnmapFailed: + return cudaErrorUnmapBufferObjectFailed; + case hipErrorArrayIsMapped: +#if CUDA_VERSION >= 10010 + return cudaErrorArrayIsMapped; +#else + return cudaErrorUnknown; +#endif + case hipErrorAlreadyMapped: +#if CUDA_VERSION >= 10010 + return cudaErrorAlreadyMapped; +#else + return cudaErrorUnknown; +#endif + case hipErrorNoBinaryForGpu: + return cudaErrorNoKernelImageForDevice; + case hipErrorAlreadyAcquired: +#if CUDA_VERSION >= 10010 + return cudaErrorAlreadyAcquired; +#else + return cudaErrorUnknown; +#endif + case hipErrorNotMapped: +#if CUDA_VERSION >= 10010 + return cudaErrorNotMapped; +#else + return cudaErrorUnknown; +#endif + case hipErrorNotMappedAsArray: +#if CUDA_VERSION >= 10010 + return cudaErrorNotMappedAsArray; +#else + return cudaErrorUnknown; +#endif + case hipErrorNotMappedAsPointer: +#if CUDA_VERSION >= 10010 + return cudaErrorNotMappedAsPointer; +#else + return cudaErrorUnknown; +#endif + case hipErrorECCNotCorrectable: + return cudaErrorECCUncorrectable; + case hipErrorUnsupportedLimit: + return cudaErrorUnsupportedLimit; + case hipErrorContextAlreadyInUse: + return cudaErrorDeviceAlreadyInUse; + case hipErrorPeerAccessUnsupported: + return cudaErrorPeerAccessUnsupported; + case hipErrorInvalidKernelFile: + return cudaErrorInvalidPtx; + case hipErrorInvalidGraphicsContext: + return cudaErrorInvalidGraphicsContext; + case hipErrorInvalidSource: +#if CUDA_VERSION >= 10010 + return cudaErrorInvalidSource; +#else + return cudaErrorUnknown; +#endif + case hipErrorFileNotFound: +#if CUDA_VERSION >= 10010 + return cudaErrorFileNotFound; +#else + return cudaErrorUnknown; +#endif + case hipErrorSharedObjectSymbolNotFound: + return cudaErrorSharedObjectSymbolNotFound; + case hipErrorSharedObjectInitFailed: + return cudaErrorSharedObjectInitFailed; + case hipErrorOperatingSystem: + return cudaErrorOperatingSystem; + case hipErrorNotFound: +#if CUDA_VERSION >= 10010 + return cudaErrorSymbolNotFound; +#else + return cudaErrorUnknown; +#endif + case hipErrorIllegalAddress: + return cudaErrorIllegalAddress; + case hipErrorLaunchTimeOut: + return cudaErrorLaunchTimeout; + case hipErrorSetOnActiveProcess: + return cudaErrorSetOnActiveProcess; + case hipErrorLaunchFailure: + return cudaErrorLaunchFailure; + case hipErrorCooperativeLaunchTooLarge: + return cudaErrorCooperativeLaunchTooLarge; + case hipErrorNotSupported: + return cudaErrorNotSupported; + // HSA: does not exist in CUDA + case hipErrorRuntimeMemory: + // HSA: does not exist in CUDA + case hipErrorRuntimeOther: + case hipErrorUnknown: + case hipErrorTbd: + default: + return cudaErrorUnknown; // Note - translated error. 
+ } +} + +inline static enum cudaMemcpyKind hipMemcpyKindToCudaMemcpyKind(hipMemcpyKind kind) { + switch (kind) { + case hipMemcpyHostToHost: + return cudaMemcpyHostToHost; + case hipMemcpyHostToDevice: + return cudaMemcpyHostToDevice; + case hipMemcpyDeviceToHost: + return cudaMemcpyDeviceToHost; + case hipMemcpyDeviceToDevice: + return cudaMemcpyDeviceToDevice; + default: + return cudaMemcpyDefault; + } +} + +inline static enum cudaTextureAddressMode hipTextureAddressModeToCudaTextureAddressMode( + hipTextureAddressMode kind) { + switch (kind) { + case hipAddressModeWrap: + return cudaAddressModeWrap; + case hipAddressModeClamp: + return cudaAddressModeClamp; + case hipAddressModeMirror: + return cudaAddressModeMirror; + case hipAddressModeBorder: + return cudaAddressModeBorder; + default: + return cudaAddressModeWrap; + } +} + +inline static enum cudaMemRangeAttribute hipMemRangeAttributeToCudaMemRangeAttribute( + hipMemRangeAttribute kind) { + switch (kind) { + case hipMemRangeAttributeReadMostly: + return cudaMemRangeAttributeReadMostly; + case hipMemRangeAttributePreferredLocation: + return cudaMemRangeAttributePreferredLocation; + case hipMemRangeAttributeAccessedBy: + return cudaMemRangeAttributeAccessedBy; + case hipMemRangeAttributeLastPrefetchLocation: + return cudaMemRangeAttributeLastPrefetchLocation; + default: + return cudaMemRangeAttributeReadMostly; + } +} + +inline static enum cudaMemoryAdvise hipMemoryAdviseTocudaMemoryAdvise( + hipMemoryAdvise kind) { + switch (kind) { + case hipMemAdviseSetReadMostly: + return cudaMemAdviseSetReadMostly; + case hipMemAdviseUnsetReadMostly : + return cudaMemAdviseUnsetReadMostly ; + case hipMemAdviseSetPreferredLocation: + return cudaMemAdviseSetPreferredLocation; + case hipMemAdviseUnsetPreferredLocation: + return cudaMemAdviseUnsetPreferredLocation; + case hipMemAdviseSetAccessedBy: + return cudaMemAdviseSetAccessedBy; + case hipMemAdviseUnsetAccessedBy: + return cudaMemAdviseUnsetAccessedBy; + default: + return cudaMemAdviseSetReadMostly; + } +} + +inline static enum cudaTextureFilterMode hipTextureFilterModeToCudaTextureFilterMode( + hipTextureFilterMode kind) { + switch (kind) { + case hipFilterModePoint: + return cudaFilterModePoint; + case hipFilterModeLinear: + return cudaFilterModeLinear; + default: + return cudaFilterModePoint; + } +} + +inline static enum cudaTextureReadMode hipTextureReadModeToCudaTextureReadMode(hipTextureReadMode kind) { + switch (kind) { + case hipReadModeElementType: + return cudaReadModeElementType; + case hipReadModeNormalizedFloat: + return cudaReadModeNormalizedFloat; + default: + return cudaReadModeElementType; + } +} + +inline static enum cudaChannelFormatKind hipChannelFormatKindToCudaChannelFormatKind( + hipChannelFormatKind kind) { + switch (kind) { + case hipChannelFormatKindSigned: + return cudaChannelFormatKindSigned; + case hipChannelFormatKindUnsigned: + return cudaChannelFormatKindUnsigned; + case hipChannelFormatKindFloat: + return cudaChannelFormatKindFloat; + case hipChannelFormatKindNone: + return cudaChannelFormatKindNone; + default: + return cudaChannelFormatKindNone; + } +} + +/** + * Stream CallBack struct + */ +#define HIPRT_CB CUDART_CB +typedef void(HIPRT_CB* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData); +inline static hipError_t hipInit(unsigned int flags) { + return hipCUResultTohipError(cuInit(flags)); +} + +inline static hipError_t hipDeviceReset() { return hipCUDAErrorTohipError(cudaDeviceReset()); } + +inline static hipError_t 
hipGetLastError() { return hipCUDAErrorTohipError(cudaGetLastError()); } + +inline static hipError_t hipPeekAtLastError() { + return hipCUDAErrorTohipError(cudaPeekAtLastError()); +} + +inline static hipError_t hipMalloc(void** ptr, size_t size) { + return hipCUDAErrorTohipError(cudaMalloc(ptr, size)); +} + +inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) { + return hipCUDAErrorTohipError(cudaMallocPitch(ptr, pitch, width, height)); +} + +inline static hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr,size_t* pitch,size_t widthInBytes,size_t height,unsigned int elementSizeBytes){ + return hipCUResultTohipError(cuMemAllocPitch(dptr,pitch,widthInBytes,height,elementSizeBytes)); +} + +inline static hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent) { + return hipCUDAErrorTohipError(cudaMalloc3D(pitchedDevPtr, extent)); +} + +inline static hipError_t hipFree(void* ptr) { return hipCUDAErrorTohipError(cudaFree(ptr)); } + +inline static hipError_t hipMallocHost(void** ptr, size_t size) + __attribute__((deprecated("use hipHostMalloc instead"))); +inline static hipError_t hipMallocHost(void** ptr, size_t size) { + return hipCUDAErrorTohipError(cudaMallocHost(ptr, size)); +} + +inline static hipError_t hipMemAllocHost(void** ptr, size_t size) + __attribute__((deprecated("use hipHostMalloc instead"))); +inline static hipError_t hipMemAllocHost(void** ptr, size_t size) { + return hipCUResultTohipError(cuMemAllocHost(ptr, size)); +} + +inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) + __attribute__((deprecated("use hipHostMalloc instead"))); +inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) { + return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags)); +} + +inline static hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) { + return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags)); +} + +inline static hipError_t hipMemAdvise(const void* dev_ptr, size_t count, hipMemoryAdvise advice, + int device) { + return hipCUDAErrorTohipError(cudaMemAdvise(dev_ptr, count, + hipMemoryAdviseTocudaMemoryAdvise(advice), device)); +} + +inline static hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, + hipStream_t stream __dparm(0)) { + return hipCUDAErrorTohipError(cudaMemPrefetchAsync(dev_ptr, count, device, stream)); +} + +inline static hipError_t hipMemRangeGetAttribute(void* data, size_t data_size, + hipMemRangeAttribute attribute, + const void* dev_ptr, size_t count) { + return hipCUDAErrorTohipError(cudaMemRangeGetAttribute(data, data_size, + hipMemRangeAttributeToCudaMemRangeAttribute(attribute), dev_ptr, count)); +} + +inline static hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes, + hipMemRangeAttribute* attributes, + size_t num_attributes, const void* dev_ptr, + size_t count) { + return hipCUDAErrorTohipError(cudaMemRangeGetAttributes(data, data_sizes, attributes, + num_attributes, dev_ptr, count)); +} + +inline static hipError_t hipStreamAttachMemAsync(hipStream_t stream, hipDeviceptr_t* dev_ptr, + size_t length __dparm(0), + unsigned int flags __dparm(hipMemAttachSingle)) { + return hipCUDAErrorTohipError(cudaStreamAttachMemAsync(stream, dev_ptr, length, flags)); +} + +inline static hipError_t hipMallocManaged(void** ptr, size_t size, unsigned int flags) { + return hipCUDAErrorTohipError(cudaMallocManaged(ptr, size, flags)); +} + +inline static hipError_t hipMallocArray(hipArray** array, const 
hipChannelFormatDesc* desc, + size_t width, size_t height, + unsigned int flags __dparm(hipArrayDefault)) { + return hipCUDAErrorTohipError(cudaMallocArray(array, desc, width, height, flags)); +} + +inline static hipError_t hipMalloc3DArray(hipArray** array, const hipChannelFormatDesc* desc, + hipExtent extent, unsigned int flags) { + return hipCUDAErrorTohipError(cudaMalloc3DArray(array, desc, extent, flags)); +} + +inline static hipError_t hipFreeArray(hipArray* array) { + return hipCUDAErrorTohipError(cudaFreeArray(array)); +} + +inline static hipError_t hipHostGetDevicePointer(void** devPtr, void* hostPtr, unsigned int flags) { + return hipCUDAErrorTohipError(cudaHostGetDevicePointer(devPtr, hostPtr, flags)); +} + +inline static hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) { + return hipCUDAErrorTohipError(cudaHostGetFlags(flagsPtr, hostPtr)); +} + +inline static hipError_t hipHostRegister(void* ptr, size_t size, unsigned int flags) { + return hipCUDAErrorTohipError(cudaHostRegister(ptr, size, flags)); +} + +inline static hipError_t hipHostUnregister(void* ptr) { + return hipCUDAErrorTohipError(cudaHostUnregister(ptr)); +} + +inline static hipError_t hipFreeHost(void* ptr) + __attribute__((deprecated("use hipHostFree instead"))); +inline static hipError_t hipFreeHost(void* ptr) { + return hipCUDAErrorTohipError(cudaFreeHost(ptr)); +} + +inline static hipError_t hipHostFree(void* ptr) { + return hipCUDAErrorTohipError(cudaFreeHost(ptr)); +} + +inline static hipError_t hipSetDevice(int device) { + return hipCUDAErrorTohipError(cudaSetDevice(device)); +} + +inline static hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop) { + struct cudaDeviceProp cdprop; + memset(&cdprop, 0x0, sizeof(struct cudaDeviceProp)); + cdprop.major = prop->major; + cdprop.minor = prop->minor; + cdprop.totalGlobalMem = prop->totalGlobalMem; + cdprop.sharedMemPerBlock = prop->sharedMemPerBlock; + cdprop.regsPerBlock = prop->regsPerBlock; + cdprop.warpSize = prop->warpSize; + cdprop.maxThreadsPerBlock = prop->maxThreadsPerBlock; + cdprop.clockRate = prop->clockRate; + cdprop.totalConstMem = prop->totalConstMem; + cdprop.multiProcessorCount = prop->multiProcessorCount; + cdprop.l2CacheSize = prop->l2CacheSize; + cdprop.maxThreadsPerMultiProcessor = prop->maxThreadsPerMultiProcessor; + cdprop.computeMode = prop->computeMode; + cdprop.canMapHostMemory = prop->canMapHostMemory; + cdprop.memoryClockRate = prop->memoryClockRate; + cdprop.memoryBusWidth = prop->memoryBusWidth; + return hipCUDAErrorTohipError(cudaChooseDevice(device, &cdprop)); +} + +inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t size) { + return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size)); +} + +inline static hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t size) { + return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size)); +} + +inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t size) { + return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size)); +} + +inline static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t size, + hipStream_t stream) { + return hipCUResultTohipError(cuMemcpyHtoDAsync(dst, src, size, stream)); +} + +inline static hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t size, + hipStream_t stream) { + return hipCUResultTohipError(cuMemcpyDtoHAsync(dst, src, size, stream)); +} + +inline static hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t size, + 
hipStream_t stream) { + return hipCUResultTohipError(cuMemcpyDtoDAsync(dst, src, size, stream)); +} + +inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind copyKind) { + return hipCUDAErrorTohipError( + cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind))); +} + + +inline static hipError_t hipMemcpyWithStream(void* dst, const void* src, + size_t sizeBytes, hipMemcpyKind copyKind, + hipStream_t stream) { + cudaError_t error = cudaMemcpyAsync(dst, src, sizeBytes, + hipMemcpyKindToCudaMemcpyKind(copyKind), + stream); + + if (error != cudaSuccess) return hipCUDAErrorTohipError(error); + + return hipCUDAErrorTohipError(cudaStreamSynchronize(stream)); +} + +inline static hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind copyKind, hipStream_t stream __dparm(0)) { + return hipCUDAErrorTohipError( + cudaMemcpyAsync(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind), stream)); +} + +inline static hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes, + size_t offset __dparm(0), + hipMemcpyKind copyType __dparm(hipMemcpyHostToDevice)) { + return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset, + hipMemcpyKindToCudaMemcpyKind(copyType))); +} + +inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, + size_t sizeBytes, size_t offset, + hipMemcpyKind copyType, + hipStream_t stream __dparm(0)) { + return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync( + symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType), stream)); +} + +inline static hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t sizeBytes, + size_t offset __dparm(0), + hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) { + return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset, + hipMemcpyKindToCudaMemcpyKind(kind))); +} + +inline static hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, + size_t sizeBytes, size_t offset, + hipMemcpyKind kind, + hipStream_t stream __dparm(0)) { + return hipCUDAErrorTohipError(cudaMemcpyFromSymbolAsync( + dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind), stream)); +} + +inline static hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) { + return hipCUDAErrorTohipError(cudaGetSymbolAddress(devPtr, symbolName)); +} + +inline static hipError_t hipGetSymbolSize(size_t* size, const void* symbolName) { + return hipCUDAErrorTohipError(cudaGetSymbolSize(size, symbolName)); +} + +inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, + size_t width, size_t height, hipMemcpyKind kind) { + return hipCUDAErrorTohipError( + cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind))); +} + +inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) { + return hipCUResultTohipError(cuMemcpy2D(pCopy)); +} + +inline static hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0)) { + return hipCUResultTohipError(cuMemcpy2DAsync(pCopy, stream)); +} + +inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) { + return hipCUDAErrorTohipError(cudaMemcpy3D(p)); +} + +inline static hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms *p, hipStream_t stream) { + return hipCUDAErrorTohipError(cudaMemcpy3DAsync(p, stream)); +} + +inline static hipError_t hipDrvMemcpy3D(const 
HIP_MEMCPY3D* pCopy) { + return hipCUResultTohipError(cuMemcpy3D(pCopy)); +} + +inline static hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) { + return hipCUResultTohipError(cuMemcpy3DAsync(pCopy, stream)); +} + +inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, + size_t width, size_t height, hipMemcpyKind kind, + hipStream_t stream) { + return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, + hipMemcpyKindToCudaMemcpyKind(kind), stream)); +} + +inline static hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray* src, + size_t wOffset, size_t hOffset, size_t width, + size_t height, hipMemcpyKind kind) { + return hipCUDAErrorTohipError(cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, hOffset, width, + height, + hipMemcpyKindToCudaMemcpyKind(kind))); +} + +inline static hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray* src, + size_t wOffset, size_t hOffset, size_t width, + size_t height, hipMemcpyKind kind, + hipStream_t stream) { + return hipCUDAErrorTohipError(cudaMemcpy2DFromArrayAsync(dst, dpitch, src, wOffset, hOffset, + width, height, + hipMemcpyKindToCudaMemcpyKind(kind), + stream)); +} + +inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, + const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind) { + return hipCUDAErrorTohipError(cudaMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, + height, hipMemcpyKindToCudaMemcpyKind(kind))); +} + +inline static hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset, + const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind, + hipStream_t stream) { + return hipCUDAErrorTohipError(cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, src, spitch, + width, height, + hipMemcpyKindToCudaMemcpyKind(kind), + stream)); +} + +__HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, + size_t hOffset, const void* src, + size_t count, hipMemcpyKind kind) { + return hipCUDAErrorTohipError( + cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind))); +} + +__HIP_DEPRECATED inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, + size_t wOffset, size_t hOffset, + size_t count, hipMemcpyKind kind) { + return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count, + hipMemcpyKindToCudaMemcpyKind(kind))); +} + +inline static hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, + size_t count) { + return hipCUResultTohipError(cuMemcpyAtoH(dst, (CUarray)srcArray, srcOffset, count)); +} + +inline static hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, + size_t count) { + return hipCUResultTohipError(cuMemcpyHtoA((CUarray)dstArray, dstOffset, srcHost, count)); +} + +inline static hipError_t hipDeviceSynchronize() { + return hipCUDAErrorTohipError(cudaDeviceSynchronize()); +} + +inline static hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* pCacheConfig) { + return hipCUDAErrorTohipError(cudaDeviceGetCacheConfig(pCacheConfig)); +} + +inline static hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value) { + return hipCUDAErrorTohipError(cudaFuncSetAttribute(func, attr, value)); +} + +inline static hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig) { + return 
hipCUDAErrorTohipError(cudaDeviceSetCacheConfig(cacheConfig)); +} + +inline static hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config) { + return hipCUDAErrorTohipError(cudaFuncSetSharedMemConfig(func, config)); +} + +inline static const char* hipGetErrorString(hipError_t error) { + return cudaGetErrorString(hipErrorToCudaError(error)); +} + +inline static const char* hipGetErrorName(hipError_t error) { + return cudaGetErrorName(hipErrorToCudaError(error)); +} + +inline static hipError_t hipGetDeviceCount(int* count) { + return hipCUDAErrorTohipError(cudaGetDeviceCount(count)); +} + +inline static hipError_t hipGetDevice(int* device) { + return hipCUDAErrorTohipError(cudaGetDevice(device)); +} + +inline static hipError_t hipIpcCloseMemHandle(void* devPtr) { + return hipCUDAErrorTohipError(cudaIpcCloseMemHandle(devPtr)); +} + +inline static hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event) { + return hipCUDAErrorTohipError(cudaIpcGetEventHandle(handle, event)); +} + +inline static hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr) { + return hipCUDAErrorTohipError(cudaIpcGetMemHandle(handle, devPtr)); +} + +inline static hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle) { + return hipCUDAErrorTohipError(cudaIpcOpenEventHandle(event, handle)); +} + +inline static hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, + unsigned int flags) { + return hipCUDAErrorTohipError(cudaIpcOpenMemHandle(devPtr, handle, flags)); +} + +inline static hipError_t hipMemset(void* devPtr, int value, size_t count) { + return hipCUDAErrorTohipError(cudaMemset(devPtr, value, count)); +} + +inline static hipError_t hipMemsetD32(hipDeviceptr_t devPtr, int value, size_t count) { + return hipCUResultTohipError(cuMemsetD32(devPtr, value, count)); +} + +inline static hipError_t hipMemsetAsync(void* devPtr, int value, size_t count, + hipStream_t stream __dparm(0)) { + return hipCUDAErrorTohipError(cudaMemsetAsync(devPtr, value, count, stream)); +} + +inline static hipError_t hipMemsetD32Async(hipDeviceptr_t devPtr, int value, size_t count, + hipStream_t stream __dparm(0)) { + return hipCUResultTohipError(cuMemsetD32Async(devPtr, value, count, stream)); +} + +inline static hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes) { + return hipCUResultTohipError(cuMemsetD8(dest, value, sizeBytes)); +} + +inline static hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes, + hipStream_t stream __dparm(0)) { + return hipCUResultTohipError(cuMemsetD8Async(dest, value, sizeBytes, stream)); +} + +inline static hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes) { + return hipCUResultTohipError(cuMemsetD16(dest, value, sizeBytes)); +} + +inline static hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes, + hipStream_t stream __dparm(0)) { + return hipCUResultTohipError(cuMemsetD16Async(dest, value, sizeBytes, stream)); +} + +inline static hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height) { + return hipCUDAErrorTohipError(cudaMemset2D(dst, pitch, value, width, height)); +} + +inline static hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height, hipStream_t stream __dparm(0)) { + return hipCUDAErrorTohipError(cudaMemset2DAsync(dst, pitch, value, width, height, stream)); +} + +inline static hipError_t 
hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent ){ + return hipCUDAErrorTohipError(cudaMemset3D(pitchedDevPtr, value, extent)); +} + +inline static hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, hipStream_t stream __dparm(0) ){ + return hipCUDAErrorTohipError(cudaMemset3DAsync(pitchedDevPtr, value, extent, stream)); +} + +inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int device) { + struct cudaDeviceProp cdprop; + cudaError_t cerror; + cerror = cudaGetDeviceProperties(&cdprop, device); + + strncpy(p_prop->name, cdprop.name, 256); + p_prop->totalGlobalMem = cdprop.totalGlobalMem; + p_prop->sharedMemPerBlock = cdprop.sharedMemPerBlock; + p_prop->regsPerBlock = cdprop.regsPerBlock; + p_prop->warpSize = cdprop.warpSize; + p_prop->maxThreadsPerBlock = cdprop.maxThreadsPerBlock; + for (int i = 0; i < 3; i++) { + p_prop->maxThreadsDim[i] = cdprop.maxThreadsDim[i]; + p_prop->maxGridSize[i] = cdprop.maxGridSize[i]; + } + p_prop->clockRate = cdprop.clockRate; + p_prop->memoryClockRate = cdprop.memoryClockRate; + p_prop->memoryBusWidth = cdprop.memoryBusWidth; + p_prop->totalConstMem = cdprop.totalConstMem; + p_prop->major = cdprop.major; + p_prop->minor = cdprop.minor; + p_prop->multiProcessorCount = cdprop.multiProcessorCount; + p_prop->l2CacheSize = cdprop.l2CacheSize; + p_prop->maxThreadsPerMultiProcessor = cdprop.maxThreadsPerMultiProcessor; + p_prop->computeMode = cdprop.computeMode; + p_prop->clockInstructionRate = cdprop.clockRate; // Same as clock-rate: + + int ccVers = p_prop->major * 100 + p_prop->minor * 10; + p_prop->arch.hasGlobalInt32Atomics = (ccVers >= 110); + p_prop->arch.hasGlobalFloatAtomicExch = (ccVers >= 110); + p_prop->arch.hasSharedInt32Atomics = (ccVers >= 120); + p_prop->arch.hasSharedFloatAtomicExch = (ccVers >= 120); + p_prop->arch.hasFloatAtomicAdd = (ccVers >= 200); + p_prop->arch.hasGlobalInt64Atomics = (ccVers >= 120); + p_prop->arch.hasSharedInt64Atomics = (ccVers >= 110); + p_prop->arch.hasDoubles = (ccVers >= 130); + p_prop->arch.hasWarpVote = (ccVers >= 120); + p_prop->arch.hasWarpBallot = (ccVers >= 200); + p_prop->arch.hasWarpShuffle = (ccVers >= 300); + p_prop->arch.hasFunnelShift = (ccVers >= 350); + p_prop->arch.hasThreadFenceSystem = (ccVers >= 200); + p_prop->arch.hasSyncThreadsExt = (ccVers >= 200); + p_prop->arch.hasSurfaceFuncs = (ccVers >= 200); + p_prop->arch.has3dGrid = (ccVers >= 200); + p_prop->arch.hasDynamicParallelism = (ccVers >= 350); + + p_prop->concurrentKernels = cdprop.concurrentKernels; + p_prop->pciDomainID = cdprop.pciDomainID; + p_prop->pciBusID = cdprop.pciBusID; + p_prop->pciDeviceID = cdprop.pciDeviceID; + p_prop->maxSharedMemoryPerMultiProcessor = cdprop.sharedMemPerMultiprocessor; + p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard; + p_prop->canMapHostMemory = cdprop.canMapHostMemory; + p_prop->gcnArch = 0; // Not a GCN arch + p_prop->integrated = cdprop.integrated; + p_prop->cooperativeLaunch = cdprop.cooperativeLaunch; + p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch; + p_prop->cooperativeMultiDeviceUnmatchedFunc = 0; + p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0; + p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0; + p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0; + + p_prop->maxTexture1D = cdprop.maxTexture1D; + p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0]; + p_prop->maxTexture2D[1] = cdprop.maxTexture2D[1]; + p_prop->maxTexture3D[0] = cdprop.maxTexture3D[0]; + p_prop->maxTexture3D[1] = 
cdprop.maxTexture3D[1]; + p_prop->maxTexture3D[2] = cdprop.maxTexture3D[2]; + + p_prop->memPitch = cdprop.memPitch; + p_prop->textureAlignment = cdprop.textureAlignment; + p_prop->texturePitchAlignment = cdprop.texturePitchAlignment; + p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled; + p_prop->ECCEnabled = cdprop.ECCEnabled; + p_prop->tccDriver = cdprop.tccDriver; + + return hipCUDAErrorTohipError(cerror); +} + +inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) { + enum cudaDeviceAttr cdattr; + cudaError_t cerror; + + switch (attr) { + case hipDeviceAttributeMaxThreadsPerBlock: + cdattr = cudaDevAttrMaxThreadsPerBlock; + break; + case hipDeviceAttributeMaxBlockDimX: + cdattr = cudaDevAttrMaxBlockDimX; + break; + case hipDeviceAttributeMaxBlockDimY: + cdattr = cudaDevAttrMaxBlockDimY; + break; + case hipDeviceAttributeMaxBlockDimZ: + cdattr = cudaDevAttrMaxBlockDimZ; + break; + case hipDeviceAttributeMaxGridDimX: + cdattr = cudaDevAttrMaxGridDimX; + break; + case hipDeviceAttributeMaxGridDimY: + cdattr = cudaDevAttrMaxGridDimY; + break; + case hipDeviceAttributeMaxGridDimZ: + cdattr = cudaDevAttrMaxGridDimZ; + break; + case hipDeviceAttributeMaxSharedMemoryPerBlock: + cdattr = cudaDevAttrMaxSharedMemoryPerBlock; + break; + case hipDeviceAttributeTotalConstantMemory: + cdattr = cudaDevAttrTotalConstantMemory; + break; + case hipDeviceAttributeWarpSize: + cdattr = cudaDevAttrWarpSize; + break; + case hipDeviceAttributeMaxRegistersPerBlock: + cdattr = cudaDevAttrMaxRegistersPerBlock; + break; + case hipDeviceAttributeClockRate: + cdattr = cudaDevAttrClockRate; + break; + case hipDeviceAttributeMemoryClockRate: + cdattr = cudaDevAttrMemoryClockRate; + break; + case hipDeviceAttributeMemoryBusWidth: + cdattr = cudaDevAttrGlobalMemoryBusWidth; + break; + case hipDeviceAttributeMultiprocessorCount: + cdattr = cudaDevAttrMultiProcessorCount; + break; + case hipDeviceAttributeComputeMode: + cdattr = cudaDevAttrComputeMode; + break; + case hipDeviceAttributeL2CacheSize: + cdattr = cudaDevAttrL2CacheSize; + break; + case hipDeviceAttributeMaxThreadsPerMultiProcessor: + cdattr = cudaDevAttrMaxThreadsPerMultiProcessor; + break; + case hipDeviceAttributeComputeCapabilityMajor: + cdattr = cudaDevAttrComputeCapabilityMajor; + break; + case hipDeviceAttributeComputeCapabilityMinor: + cdattr = cudaDevAttrComputeCapabilityMinor; + break; + case hipDeviceAttributeConcurrentKernels: + cdattr = cudaDevAttrConcurrentKernels; + break; + case hipDeviceAttributePciBusId: + cdattr = cudaDevAttrPciBusId; + break; + case hipDeviceAttributePciDeviceId: + cdattr = cudaDevAttrPciDeviceId; + break; + case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: + cdattr = cudaDevAttrMaxSharedMemoryPerMultiprocessor; + break; + case hipDeviceAttributeIsMultiGpuBoard: + cdattr = cudaDevAttrIsMultiGpuBoard; + break; + case hipDeviceAttributeIntegrated: + cdattr = cudaDevAttrIntegrated; + break; + case hipDeviceAttributeMaxTexture1DWidth: + cdattr = cudaDevAttrMaxTexture1DWidth; + break; + case hipDeviceAttributeMaxTexture2DWidth: + cdattr = cudaDevAttrMaxTexture2DWidth; + break; + case hipDeviceAttributeMaxTexture2DHeight: + cdattr = cudaDevAttrMaxTexture2DHeight; + break; + case hipDeviceAttributeMaxTexture3DWidth: + cdattr = cudaDevAttrMaxTexture3DWidth; + break; + case hipDeviceAttributeMaxTexture3DHeight: + cdattr = cudaDevAttrMaxTexture3DHeight; + break; + case hipDeviceAttributeMaxTexture3DDepth: + cdattr = cudaDevAttrMaxTexture3DDepth; + break; + case 
hipDeviceAttributeMaxPitch: + cdattr = cudaDevAttrMaxPitch; + break; + case hipDeviceAttributeTextureAlignment: + cdattr = cudaDevAttrTextureAlignment; + break; + case hipDeviceAttributeTexturePitchAlignment: + cdattr = cudaDevAttrTexturePitchAlignment; + break; + case hipDeviceAttributeKernelExecTimeout: + cdattr = cudaDevAttrKernelExecTimeout; + break; + case hipDeviceAttributeCanMapHostMemory: + cdattr = cudaDevAttrCanMapHostMemory; + break; + case hipDeviceAttributeEccEnabled: + cdattr = cudaDevAttrEccEnabled; + break; + case hipDeviceAttributeCooperativeLaunch: + cdattr = cudaDevAttrCooperativeLaunch; + break; + case hipDeviceAttributeCooperativeMultiDeviceLaunch: + cdattr = cudaDevAttrCooperativeMultiDeviceLaunch; + break; + case hipDeviceAttributeConcurrentManagedAccess: + cdattr = cudaDevAttrConcurrentManagedAccess; + break; + case hipDeviceAttributeManagedMemory: + cdattr = cudaDevAttrManagedMemory; + break; + case hipDeviceAttributePageableMemoryAccessUsesHostPageTables: + cdattr = cudaDevAttrPageableMemoryAccessUsesHostPageTables; + break; + case hipDeviceAttributePageableMemoryAccess: + cdattr = cudaDevAttrPageableMemoryAccess; + break; + case hipDeviceAttributeDirectManagedMemAccessFromHost: + cdattr = cudaDevAttrDirectManagedMemAccessFromHost; + break; + default: + return hipCUDAErrorTohipError(cudaErrorInvalidValue); + } + + cerror = cudaDeviceGetAttribute(pi, cdattr, device); + + return hipCUDAErrorTohipError(cerror); +} + +inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, + const void* func, + int blockSize, + size_t dynamicSMemSize) { + return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, + blockSize, dynamicSMemSize)); +} + +inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, + const void* func, + int blockSize, + size_t dynamicSMemSize, + unsigned int flags) { + return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, + blockSize, dynamicSMemSize, flags)); +} + +inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, + hipFunction_t f, + int blockSize, + size_t dynamicSMemSize ){ + return hipCUResultTohipError(cuOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, + blockSize, dynamicSMemSize)); +} + +inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, + hipFunction_t f, + int blockSize, + size_t dynamicSMemSize, + unsigned int flags ) { + return hipCUResultTohipError(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks,f, + blockSize, dynamicSMemSize, flags)); +} + +//TODO - Match CUoccupancyB2DSize +inline static hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + hipFunction_t f, size_t dynSharedMemPerBlk, + int blockSizeLimit){ + return hipCUResultTohipError(cuOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, NULL, + dynSharedMemPerBlk, blockSizeLimit)); +} + +//TODO - Match CUoccupancyB2DSize +inline static hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, + hipFunction_t f, size_t dynSharedMemPerBlk, + int blockSizeLimit, unsigned int flags){ + return hipCUResultTohipError(cuOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, f, NULL, + dynSharedMemPerBlk, blockSizeLimit, flags)); +} + +inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr) { + struct cudaPointerAttributes 
cPA; + hipError_t err = hipCUDAErrorTohipError(cudaPointerGetAttributes(&cPA, ptr)); + if (err == hipSuccess) { +#if (CUDART_VERSION >= 11000) + auto memType = cPA.type; +#else + unsigned memType = cPA.memoryType; // No auto because cuda 10.2 doesnt force c++11 +#endif + switch (memType) { + case cudaMemoryTypeDevice: + attributes->memoryType = hipMemoryTypeDevice; + break; + case cudaMemoryTypeHost: + attributes->memoryType = hipMemoryTypeHost; + break; + default: + return hipErrorUnknown; + } + attributes->device = cPA.device; + attributes->devicePointer = cPA.devicePointer; + attributes->hostPointer = cPA.hostPointer; + attributes->isManaged = 0; + attributes->allocationFlags = 0; + } + return err; +} + +inline static hipError_t hipMemGetInfo(size_t* free, size_t* total) { + return hipCUDAErrorTohipError(cudaMemGetInfo(free, total)); +} + +inline static hipError_t hipEventCreate(hipEvent_t* event) { + return hipCUDAErrorTohipError(cudaEventCreate(event)); +} + +inline static hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream __dparm(NULL)) { + return hipCUDAErrorTohipError(cudaEventRecord(event, stream)); +} + +inline static hipError_t hipEventSynchronize(hipEvent_t event) { + return hipCUDAErrorTohipError(cudaEventSynchronize(event)); +} + +inline static hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop) { + return hipCUDAErrorTohipError(cudaEventElapsedTime(ms, start, stop)); +} + +inline static hipError_t hipEventDestroy(hipEvent_t event) { + return hipCUDAErrorTohipError(cudaEventDestroy(event)); +} + +inline static hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) { + return hipCUDAErrorTohipError(cudaStreamCreateWithFlags(stream, flags)); +} + +inline static hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) { + return hipCUDAErrorTohipError(cudaStreamCreateWithPriority(stream, flags, priority)); +} + +inline static hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) { + return hipCUDAErrorTohipError(cudaDeviceGetStreamPriorityRange(leastPriority, greatestPriority)); +} + +inline static hipError_t hipStreamCreate(hipStream_t* stream) { + return hipCUDAErrorTohipError(cudaStreamCreate(stream)); +} + +inline static hipError_t hipStreamSynchronize(hipStream_t stream) { + return hipCUDAErrorTohipError(cudaStreamSynchronize(stream)); +} + +inline static hipError_t hipStreamDestroy(hipStream_t stream) { + return hipCUDAErrorTohipError(cudaStreamDestroy(stream)); +} + +inline static hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags) { + return hipCUDAErrorTohipError(cudaStreamGetFlags(stream, flags)); +} + +inline static hipError_t hipStreamGetPriority(hipStream_t stream, int *priority) { + return hipCUDAErrorTohipError(cudaStreamGetPriority(stream, priority)); +} + +inline static hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, + unsigned int flags) { + return hipCUDAErrorTohipError(cudaStreamWaitEvent(stream, event, flags)); +} + +inline static hipError_t hipStreamQuery(hipStream_t stream) { + return hipCUDAErrorTohipError(cudaStreamQuery(stream)); +} + +inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, + void* userData, unsigned int flags) { + return hipCUDAErrorTohipError( + cudaStreamAddCallback(stream, (cudaStreamCallback_t)callback, userData, flags)); +} + +inline static hipError_t hipDriverGetVersion(int* driverVersion) { + cudaError_t err = 
cudaDriverGetVersion(driverVersion); + + // Override driver version to match version reported on HCC side. + *driverVersion = 4; + + return hipCUDAErrorTohipError(err); +} + +inline static hipError_t hipRuntimeGetVersion(int* runtimeVersion) { + return hipCUDAErrorTohipError(cudaRuntimeGetVersion(runtimeVersion)); +} + +inline static hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, int device, int peerDevice) { + return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice)); +} + +inline static hipError_t hipDeviceDisablePeerAccess(int peerDevice) { + return hipCUDAErrorTohipError(cudaDeviceDisablePeerAccess(peerDevice)); +} + +inline static hipError_t hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags) { + return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess(peerDevice, flags)); +} + +inline static hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx) { + return hipCUResultTohipError(cuCtxDisablePeerAccess(peerCtx)); +} + +inline static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags) { + return hipCUResultTohipError(cuCtxEnablePeerAccess(peerCtx, flags)); +} + +inline static hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, + int* active) { + return hipCUResultTohipError(cuDevicePrimaryCtxGetState(dev, flags, active)); +} + +inline static hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev) { + return hipCUResultTohipError(cuDevicePrimaryCtxRelease(dev)); +} + +inline static hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev) { + return hipCUResultTohipError(cuDevicePrimaryCtxRetain(pctx, dev)); +} + +inline static hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev) { + return hipCUResultTohipError(cuDevicePrimaryCtxReset(dev)); +} + +inline static hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags) { + return hipCUResultTohipError(cuDevicePrimaryCtxSetFlags(dev, flags)); +} + +inline static hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, + hipDeviceptr_t dptr) { + return hipCUResultTohipError(cuMemGetAddressRange(pbase, psize, dptr)); +} + +inline static hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevice, + size_t count) { + return hipCUDAErrorTohipError(cudaMemcpyPeer(dst, dstDevice, src, srcDevice, count)); +} + +inline static hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, + int srcDevice, size_t count, + hipStream_t stream __dparm(0)) { + return hipCUDAErrorTohipError( + cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream)); +} + +// Profile APIs: +inline static hipError_t hipProfilerStart() { return hipCUDAErrorTohipError(cudaProfilerStart()); } + +inline static hipError_t hipProfilerStop() { return hipCUDAErrorTohipError(cudaProfilerStop()); } + +inline static hipError_t hipGetDeviceFlags(unsigned int* flags) { + return hipCUDAErrorTohipError(cudaGetDeviceFlags(flags)); +} + +inline static hipError_t hipSetDeviceFlags(unsigned int flags) { + return hipCUDAErrorTohipError(cudaSetDeviceFlags(flags)); +} + +inline static hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned int flags) { + return hipCUDAErrorTohipError(cudaEventCreateWithFlags(event, flags)); +} + +inline static hipError_t hipEventQuery(hipEvent_t event) { + return hipCUDAErrorTohipError(cudaEventQuery(event)); +} + +inline static hipError_t hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device) { + return hipCUResultTohipError(cuCtxCreate(ctx, flags, device)); +} + 
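Because the stream and event entry points above are one-line pass-throughs to the CUDA runtime, the familiar CUDA event-timing idiom works unchanged through the HIP spellings. A self-contained sketch assuming only this header's wrappers; the buffer size and output text are illustrative:

    #include <hip/hip_runtime.h>
    #include <cstdio>
    #include <vector>

    int main() {
        hipStream_t stream;
        hipEvent_t start, stop;
        hipStreamCreateWithFlags(&stream, hipStreamNonBlocking); // -> cudaStreamCreateWithFlags
        hipEventCreateWithFlags(&start, hipEventDefault);        // -> cudaEventCreateWithFlags
        hipEventCreateWithFlags(&stop, hipEventDefault);

        std::vector<float> host(1 << 20, 1.0f);
        void* dev = nullptr;
        hipMalloc(&dev, host.size() * sizeof(float));

        hipEventRecord(start, stream);
        // The copy kind is translated by hipMemcpyKindToCudaMemcpyKind above.
        hipMemcpyAsync(dev, host.data(), host.size() * sizeof(float),
                       hipMemcpyHostToDevice, stream);
        hipEventRecord(stop, stream);
        hipEventSynchronize(stop);

        float ms = 0.0f;
        hipEventElapsedTime(&ms, start, stop);
        std::printf("H2D copy took %.3f ms\n", ms);

        hipFree(dev);
        hipEventDestroy(start);
        hipEventDestroy(stop);
        hipStreamDestroy(stream);
        return 0;
    }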
+inline static hipError_t hipCtxDestroy(hipCtx_t ctx) { + return hipCUResultTohipError(cuCtxDestroy(ctx)); +} + +inline static hipError_t hipCtxPopCurrent(hipCtx_t* ctx) { + return hipCUResultTohipError(cuCtxPopCurrent(ctx)); +} + +inline static hipError_t hipCtxPushCurrent(hipCtx_t ctx) { + return hipCUResultTohipError(cuCtxPushCurrent(ctx)); +} + +inline static hipError_t hipCtxSetCurrent(hipCtx_t ctx) { + return hipCUResultTohipError(cuCtxSetCurrent(ctx)); +} + +inline static hipError_t hipCtxGetCurrent(hipCtx_t* ctx) { + return hipCUResultTohipError(cuCtxGetCurrent(ctx)); +} + +inline static hipError_t hipCtxGetDevice(hipDevice_t* device) { + return hipCUResultTohipError(cuCtxGetDevice(device)); +} + +inline static hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion) { + return hipCUResultTohipError(cuCtxGetApiVersion(ctx, (unsigned int*)apiVersion)); +} + +inline static hipError_t hipCtxGetCacheConfig(hipFuncCache* cacheConfig) { + return hipCUResultTohipError(cuCtxGetCacheConfig(cacheConfig)); +} + +inline static hipError_t hipCtxSetCacheConfig(hipFuncCache cacheConfig) { + return hipCUResultTohipError(cuCtxSetCacheConfig(cacheConfig)); +} + +inline static hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config) { + return hipCUResultTohipError(cuCtxSetSharedMemConfig((CUsharedconfig)config)); +} + +inline static hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig) { + return hipCUResultTohipError(cuCtxGetSharedMemConfig((CUsharedconfig*)pConfig)); +} + +inline static hipError_t hipCtxSynchronize(void) { + return hipCUResultTohipError(cuCtxSynchronize()); +} + +inline static hipError_t hipCtxGetFlags(unsigned int* flags) { + return hipCUResultTohipError(cuCtxGetFlags(flags)); +} + +inline static hipError_t hipCtxDetach(hipCtx_t ctx) { + return hipCUResultTohipError(cuCtxDetach(ctx)); +} + +inline static hipError_t hipDeviceGet(hipDevice_t* device, int ordinal) { + return hipCUResultTohipError(cuDeviceGet(device, ordinal)); +} + +inline static hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device) { + return hipCUResultTohipError(cuDeviceComputeCapability(major, minor, device)); +} + +inline static hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device) { + return hipCUResultTohipError(cuDeviceGetName(name, len, device)); +} + +inline static hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, + int srcDevice, int dstDevice) { + return hipCUDAErrorTohipError(cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice)); +} + +inline static hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, hipDevice_t device) { + return hipCUDAErrorTohipError(cudaDeviceGetPCIBusId(pciBusId, len, device)); +} + +inline static hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId) { + return hipCUDAErrorTohipError(cudaDeviceGetByPCIBusId(device, pciBusId)); +} + +inline static hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* config) { + return hipCUDAErrorTohipError(cudaDeviceGetSharedMemConfig(config)); +} + +inline static hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) { + return hipCUDAErrorTohipError(cudaDeviceSetSharedMemConfig(config)); +} + +inline static hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) { + return hipCUDAErrorTohipError(cudaDeviceGetLimit(pValue, limit)); +} + +inline static hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device) { + return hipCUResultTohipError(cuDeviceTotalMem(bytes, device)); +} + +inline static hipError_t 
hipModuleLoad(hipModule_t* module, const char* fname) {
+    return hipCUResultTohipError(cuModuleLoad(module, fname));
+}
+
+inline static hipError_t hipModuleUnload(hipModule_t hmod) {
+    return hipCUResultTohipError(cuModuleUnload(hmod));
+}
+
+inline static hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module,
+                                              const char* kname) {
+    return hipCUResultTohipError(cuModuleGetFunction(function, module, kname));
+}
+
+inline static hipError_t hipModuleGetTexRef(hipTexRef* pTexRef, hipModule_t hmod, const char* name) {
+    return hipCUResultTohipError(cuModuleGetTexRef(pTexRef, hmod, name));
+}
+
+inline static hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) {
+    return hipCUDAErrorTohipError(cudaFuncGetAttributes(attr, func));
+}
+
+inline static hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc) {
+    return hipCUResultTohipError(cuFuncGetAttribute(value, attrib, hfunc));
+}
+
+inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
+                                            const char* name) {
+    return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
+}
+
+inline static hipError_t hipModuleLoadData(hipModule_t* module, const void* image) {
+    return hipCUResultTohipError(cuModuleLoadData(module, image));
+}
+
+inline static hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image,
+                                             unsigned int numOptions, hipJitOption* options,
+                                             void** optionValues) {
+    return hipCUResultTohipError(
+        cuModuleLoadDataEx(module, image, numOptions, options, optionValues));
+}
+
+inline static hipError_t hipLaunchKernel(const void* function_address, dim3 numBlocks,
+                                         dim3 dimBlocks, void** args, size_t sharedMemBytes,
+                                         hipStream_t stream) {
+    return hipCUDAErrorTohipError(
+        cudaLaunchKernel(function_address, numBlocks, dimBlocks, args, sharedMemBytes, stream));
+}
+
+inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
+                                               unsigned int gridDimY, unsigned int gridDimZ,
+                                               unsigned int blockDimX, unsigned int blockDimY,
+                                               unsigned int blockDimZ, unsigned int sharedMemBytes,
+                                               hipStream_t stream, void** kernelParams,
+                                               void** extra) {
+    return hipCUResultTohipError(cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX,
+                                                blockDimY, blockDimZ, sharedMemBytes, stream,
+                                                kernelParams, extra));
+}
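A minimal usage sketch of the module wrappers above (not part of the header): on this NVIDIA path they map to cuModuleLoad/cuModuleGetFunction/cuLaunchKernel, so the loaded file would be PTX or a cubin. The file name and kernel name below are hypothetical.

#include <hip/hip_runtime_api.h>

hipError_t launchVectorAdd(float* a, float* b, float* c, int n) {
    hipModule_t module;
    hipFunction_t kernel;
    hipError_t err = hipModuleLoad(&module, "vector_add.ptx");  // hypothetical code object
    if (err != hipSuccess) return err;
    err = hipModuleGetFunction(&kernel, module, "vector_add");  // hypothetical kernel name
    if (err != hipSuccess) { hipModuleUnload(module); return err; }
    void* args[] = {&a, &b, &c, &n};  // kernelParams: addresses of the arguments
    err = hipModuleLaunchKernel(kernel, (n + 255) / 256, 1, 1,  // grid dims
                                256, 1, 1,                      // block dims
                                0 /*sharedMemBytes*/, 0 /*stream*/, args, nullptr);
    hipModuleUnload(module);
    return err;
}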
+
+inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {
+    return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset,
+                                                         struct textureReference* tex,
+                                                         const void* devPtr,
+                                                         const hipChannelFormatDesc* desc,
+                                                         size_t size __dparm(UINT_MAX)) {
+    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipBindTexture2D(
+    size_t* offset, struct textureReference* tex, const void* devPtr,
+    const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {
+    return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
+}
+
+inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
+                                                        hipChannelFormatKind f) {
+    return cudaCreateChannelDesc(x, y, z, w, hipChannelFormatKindToCudaChannelFormatKind(f));
+}
+
+inline static hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
+                                                const hipResourceDesc* pResDesc,
+                                                const hipTextureDesc* pTexDesc,
+                                                const hipResourceViewDesc* pResViewDesc) {
+    return hipCUDAErrorTohipError(
+        cudaCreateTextureObject(pTexObject, pResDesc, pTexDesc, pResViewDesc));
+}
+
+inline static hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject) {
+    return hipCUDAErrorTohipError(cudaDestroyTextureObject(textureObject));
+}
+
+inline static hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject,
+                                                const hipResourceDesc* pResDesc) {
+    return hipCUDAErrorTohipError(cudaCreateSurfaceObject(pSurfObject, pResDesc));
+}
+
+inline static hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject) {
+    return hipCUDAErrorTohipError(cudaDestroySurfaceObject(surfaceObject));
+}
+
+inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc,
+                                                         hipTextureObject_t textureObject) {
+    return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc(pResDesc, textureObject));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
+    size_t* offset, const struct textureReference* texref) {
+    return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset, texref));
+}
+
+inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array) {
+    return hipCUDAErrorTohipError(cudaGetChannelDesc(desc, array));
+}
+
+inline static hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
+                                                    void** kernelParams, unsigned int sharedMemBytes,
+                                                    hipStream_t stream) {
+    return hipCUDAErrorTohipError(
+        cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
+}
+
+inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                               int numDevices, unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags));
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __CUDACC__
+
+template <typename T>
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+                                                                      T func,
+                                                                      int blockSize,
+                                                                      size_t dynamicSMemSize) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func,
+                                                                                blockSize, dynamicSMemSize));
+}
+
+template <typename T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, T func,
+                                                           size_t dynamicSMemSize = 0,
+                                                           int blockSizeLimit = 0) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func,
+                                                                     dynamicSMemSize, blockSizeLimit));
+}
+
+template <typename T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* minGridSize, int* blockSize, T func,
+                                                                    size_t dynamicSMemSize = 0,
+                                                                    int blockSizeLimit = 0, unsigned int flags = 0) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSizeWithFlags(minGridSize, blockSize, func,
+                                                                              dynamicSMemSize, blockSizeLimit, flags));
+}
+
+template <typename T>
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, T func,
+                                                                               int blockSize, size_t dynamicSMemSize, unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func,
+                                                                                         blockSize, dynamicSMemSize, flags));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+inline static hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
+                                        const void* devPtr, size_t size = UINT_MAX) {
+    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, size));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+inline static hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex,
+                                        const void* devPtr, const hipChannelFormatDesc& desc,
+                                        size_t size = UINT_MAX) {
+    return
hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>* tex) {
+    return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>& tex) {
+    return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>& tex, hipArray_const_t array,
+    const hipChannelFormatDesc& desc) {
+    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>* tex, hipArray_const_t array,
+    const hipChannelFormatDesc* desc) {
+    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
+    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
+}
+
+template <typename T>
+inline static hipChannelFormatDesc hipCreateChannelDesc() {
+    return cudaCreateChannelDesc<T>();
+}
+
+template <typename T>
+inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
+                                                    void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
+    return hipCUDAErrorTohipError(
+        cudaLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim, blockDim, kernelParams, sharedMemBytes, stream));
+}
+
+inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipAddress_mode am) {
+    return hipCUResultTohipError(cuTexRefSetAddressMode(hTexRef, dim, am));
+}
+
+inline static hipError_t hipTexRefSetFilterMode(hipTexRef hTexRef, hipFilter_mode fm) {
+    return hipCUResultTohipError(cuTexRefSetFilterMode(hTexRef, fm));
+}
+
+inline static hipError_t hipTexRefSetAddress(size_t* ByteOffset, hipTexRef hTexRef, hipDeviceptr_t dptr, size_t bytes) {
+    return hipCUResultTohipError(cuTexRefSetAddress(ByteOffset, hTexRef, dptr, bytes));
+}
+
+inline static hipError_t hipTexRefSetAddress2D(hipTexRef hTexRef, const CUDA_ARRAY_DESCRIPTOR* desc, hipDeviceptr_t dptr, size_t Pitch) {
+    return hipCUResultTohipError(cuTexRefSetAddress2D(hTexRef, desc, dptr, Pitch));
+}
+
+inline static hipError_t hipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents) {
+    return hipCUResultTohipError(cuTexRefSetFormat(hTexRef, fmt, NumPackedComponents));
+}
+
+inline static hipError_t hipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags) {
+    return hipCUResultTohipError(cuTexRefSetFlags(hTexRef, Flags));
+}
+
+inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, hiparray hArray, unsigned int Flags) {
+    return hipCUResultTohipError(cuTexRefSetArray(hTexRef, hArray, Flags));
+}
+
+inline static hipError_t hipArrayCreate(hiparray* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray) {
+    return hipCUResultTohipError(cuArrayCreate(pHandle, pAllocateArray));
+}
+
+inline static hipError_t hipArrayDestroy(hiparray hArray) {
+    return hipCUResultTohipError(cuArrayDestroy(hArray));
+}
+
+inline static hipError_t hipArray3DCreate(hiparray* pHandle,
+                                          const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray) {
+    return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
+}
+
+#endif  //__CUDACC__
+
+#endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H
diff --git a/include/hip/nvidia_detail/hip_texture_types.h b/include/hip/nvidia_detail/hip_texture_types.h
new file mode 100644
index 0000000000..df374d705a
--- /dev/null
+++ b/include/hip/nvidia_detail/hip_texture_types.h
@@ -0,0 +1,6 @@
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
+
+#include <cuda_texture_types.h>
+
+#endif
diff --git a/include/hip/nvidia_detail/hiprtc.h b/include/hip/nvidia_detail/hiprtc.h
new file mode 100644
index 0000000000..449ba26c0f
--- /dev/null
+++ b/include/hip/nvidia_detail/hiprtc.h
@@ -0,0 +1,168 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef HIPRTC_H
+#define HIPRTC_H
+
+#include <cuda.h>
+#include <nvrtc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include <stdlib.h>
+
+#if !defined(_WIN32)
+#pragma GCC visibility push(default)
+#endif
+
+typedef enum hiprtcResult {
+    HIPRTC_SUCCESS = 0,
+    HIPRTC_ERROR_OUT_OF_MEMORY = 1,
+    HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+    HIPRTC_ERROR_INVALID_INPUT = 3,
+    HIPRTC_ERROR_INVALID_PROGRAM = 4,
+    HIPRTC_ERROR_INVALID_OPTION = 5,
+    HIPRTC_ERROR_COMPILATION = 6,
+    HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+    HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+    HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+    HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+    HIPRTC_ERROR_INTERNAL_ERROR = 11
+} hiprtcResult;
+
+inline static nvrtcResult hiprtcResultTonvrtcResult(hiprtcResult result) {
+    switch (result) {
+        case HIPRTC_SUCCESS:
+            return NVRTC_SUCCESS;
+        case HIPRTC_ERROR_OUT_OF_MEMORY:
+            return NVRTC_ERROR_OUT_OF_MEMORY;
+        case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE:
+            return NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
+        case HIPRTC_ERROR_INVALID_INPUT:
+            return NVRTC_ERROR_INVALID_INPUT;
+        case HIPRTC_ERROR_INVALID_PROGRAM:
+            return NVRTC_ERROR_INVALID_PROGRAM;
+        case HIPRTC_ERROR_INVALID_OPTION:
+            return NVRTC_ERROR_INVALID_OPTION;
+        case HIPRTC_ERROR_COMPILATION:
+            return NVRTC_ERROR_COMPILATION;
+        case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE:
+            return NVRTC_ERROR_BUILTIN_OPERATION_FAILURE;
+        case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
+            return NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
+        case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
+            return NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
+        case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
+            return NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
+        case HIPRTC_ERROR_INTERNAL_ERROR:
+            return NVRTC_ERROR_INTERNAL_ERROR;
+    }
+}
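As a side note (not part of the header): a minimal sketch of how the wrappers defined later in this file chain together on the NVIDIA path, where hiprtcGetCode returns PTX via nvrtcGetPTX. The kernel source string and file name are made up; error handling is abbreviated.

#include <hip/hiprtc.h>
#include <string>

std::string compileToCode() {
    static const char* src = "extern \"C\" __global__ void noop() {}";  // hypothetical kernel
    hiprtcProgram prog;
    if (hiprtcCreateProgram(&prog, src, "noop.cu", 0, nullptr, nullptr) != HIPRTC_SUCCESS)
        return {};
    if (hiprtcCompileProgram(prog, 0, nullptr) != HIPRTC_SUCCESS) {
        size_t logSize = 0;
        hiprtcGetProgramLogSize(prog, &logSize);
        std::string log(logSize, '\0');
        hiprtcGetProgramLog(prog, &log[0]);  // compiler diagnostics, if needed
        hiprtcDestroyProgram(&prog);
        return {};
    }
    size_t codeSize = 0;
    hiprtcGetCodeSize(prog, &codeSize);
    std::string code(codeSize, '\0');
    hiprtcGetCode(prog, &code[0]);           // PTX on this path
    hiprtcDestroyProgram(&prog);
    return code;
}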
+inline static hiprtcResult nvrtcResultTohiprtcResult(nvrtcResult result) {
+    switch (result) {
+        case NVRTC_SUCCESS:
+            return HIPRTC_SUCCESS;
+        case NVRTC_ERROR_OUT_OF_MEMORY:
+            return HIPRTC_ERROR_OUT_OF_MEMORY;
+        case NVRTC_ERROR_PROGRAM_CREATION_FAILURE:
+            return HIPRTC_ERROR_PROGRAM_CREATION_FAILURE;
+        case NVRTC_ERROR_INVALID_INPUT:
+            return HIPRTC_ERROR_INVALID_INPUT;
+        case NVRTC_ERROR_INVALID_PROGRAM:
+            return HIPRTC_ERROR_INVALID_PROGRAM;
+        case NVRTC_ERROR_INVALID_OPTION:
+            return HIPRTC_ERROR_INVALID_OPTION;
+        case NVRTC_ERROR_COMPILATION:
+            return HIPRTC_ERROR_COMPILATION;
+        case NVRTC_ERROR_BUILTIN_OPERATION_FAILURE:
+            return HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE;
+        case NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
+            return HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
+        case NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
+            return HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
+        case NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
+            return HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
+        case NVRTC_ERROR_INTERNAL_ERROR:
+            return HIPRTC_ERROR_INTERNAL_ERROR;
+    }
+}
+
+inline static const char* hiprtcGetErrorString(hiprtcResult result) {
+    return nvrtcGetErrorString(hiprtcResultTonvrtcResult(result));
+}
+
+inline static hiprtcResult hiprtcVersion(int* major, int* minor) {
+    return nvrtcResultTohiprtcResult(nvrtcVersion(major, minor));
+}
+
+typedef nvrtcProgram hiprtcProgram;
+
+inline static hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression) {
+    return nvrtcResultTohiprtcResult(nvrtcAddNameExpression(prog, name_expression));
+}
+
+inline static hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions, const char** options) {
+    return nvrtcResultTohiprtcResult(nvrtcCompileProgram(prog, numOptions, options));
+}
+
+inline static hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src, const char* name,
+                                               int numHeaders, const char** headers, const char** includeNames) {
+    return nvrtcResultTohiprtcResult(
+        nvrtcCreateProgram(prog, src, name, numHeaders, headers, includeNames));
+}
+
+inline static hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog) {
+    return nvrtcResultTohiprtcResult(nvrtcDestroyProgram(prog));
+}
+
+inline static hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog, const char* name_expression,
+                                                const char** lowered_name) {
+    return nvrtcResultTohiprtcResult(nvrtcGetLoweredName(prog, name_expression, lowered_name));
+}
+
+inline static hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log) {
+    return nvrtcResultTohiprtcResult(nvrtcGetProgramLog(prog, log));
+}
+
+inline static hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet) {
+    return nvrtcResultTohiprtcResult(nvrtcGetProgramLogSize(prog, logSizeRet));
+}
+
+inline static hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code) {
+    return nvrtcResultTohiprtcResult(nvrtcGetPTX(prog, code));
+}
+
+inline static hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet) {
+    return nvrtcResultTohiprtcResult(nvrtcGetPTXSize(prog, codeSizeRet));
+}
+
+#if !defined(_WIN32)
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif  // HIPRTC_H
diff --git a/include/hip/texture_types.h b/include/hip/texture_types.h
index 07cd9833a9..4088d67af4 100644
--- a/include/hip/texture_types.h
+++ b/include/hip/texture_types.h
@@ -26,7 +26,7 @@ THE SOFTWARE.
 #include 
 #if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__))
-#include 
+#include 
 #elif !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__))
 #include "texture_types.h"
 #else
diff --git a/packaging/hip-base.postinst b/packaging/hip-base.postinst
index 5df8adab62..ebc04c05b8 100755
--- a/packaging/hip-base.postinst
+++ b/packaging/hip-base.postinst
@@ -45,5 +45,4 @@ cd $CURRENTDIR
 # The following will be removed after upstream updation
 cd $HIPINCDIR
 ln -r -s -f amd_detail hcc_detail
-ln -r -s -f nvidia_detail nvcc_detail
-cd $CURRENTDIR
+cd $CURRENTDIR
\ No newline at end of file
diff --git a/packaging/hip-base.prerm b/packaging/hip-base.prerm
index 07a2bbf35d..895645f21a 100755
--- a/packaging/hip-base.prerm
+++ b/packaging/hip-base.prerm
@@ -41,7 +41,6 @@ rmdir --ignore-fail-on-non-empty $ROCMBINDIR
 HIPINCDIR=$HIPDIR/include
 cd $HIPINCDIR/hip
 rm hcc_detail
-rm nvcc_detail
 cd $CURRENTDIR
 
 ROCMINCDIR=$ROCMDIR/include
diff --git a/packaging/hip-base.txt b/packaging/hip-base.txt
index b3fd6aaf6e..82d30dc92f 100644
--- a/packaging/hip-base.txt
+++ b/packaging/hip-base.txt
@@ -21,15 +21,9 @@ cmake_minimum_required(VERSION 2.8.3)
 
 project(hip_base)
 
-if(WIN32)
-  install(DIRECTORY @hip_SOURCE_DIR@/bin DESTINATION . USE_SOURCE_PERMISSIONS)
-else()
-  install(DIRECTORY @hip_SOURCE_DIR@/bin DESTINATION . USE_SOURCE_PERMISSIONS
-          PATTERN *.bat EXCLUDE)
-endif()
-install(DIRECTORY @hip_SOURCE_DIR@/include DESTINATION .)
-install(DIRECTORY @HIP_AMD_BACKEND_SOURCE_DIR@/include/hip/amd_detail DESTINATION include/hip)
-install(DIRECTORY @HIP_AMD_BACKEND_SOURCE_DIR@/include/hip/nvidia_detail DESTINATION include/hip)
+install(DIRECTORY @hip_SOURCE_DIR@/bin DESTINATION . USE_SOURCE_PERMISSIONS)
+# The following 'PATTERN "hcc_detail" EXCLUDE' will be removed after upstream updation
+install(DIRECTORY @hip_SOURCE_DIR@/include DESTINATION .
PATTERN "hcc_detail" EXCLUDE) install(FILES @PROJECT_BINARY_DIR@/include/hip/amd_detail/hip_prof_str.h DESTINATION include/hip/amd_detail) install(FILES @PROJECT_BINARY_DIR@/include/hip/hip_version.h diff --git a/packaging/hip-rocclr.txt b/packaging/hip-rocclr.txt index ccff0d0358..1a8a38f5db 100644 --- a/packaging/hip-rocclr.txt +++ b/packaging/hip-rocclr.txt @@ -31,7 +31,29 @@ endif() install(FILES @PROJECT_BINARY_DIR@/.hipInfo DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/hip-config.cmake @PROJECT_BINARY_DIR@/hip-config-version.cmake DESTINATION lib/cmake/hip) -install(FILES @PROJECT_BINARY_DIR@/src/hipamd/hip-lang-config.cmake @PROJECT_BINARY_DIR@/src/hipamd/hip-lang-config-version.cmake DESTINATION lib/cmake/hip-lang) +install(FILES @PROJECT_BINARY_DIR@/rocclr/hip-lang-config.cmake @PROJECT_BINARY_DIR@/rocclr/hip-lang-config-version.cmake DESTINATION lib/cmake/hip-lang) + +if(@__HIP_ENABLE_RTC@) + install(FILES @PROJECT_BINARY_DIR@/lib/libhiprtc-builtins.so.@HIP_LIB_VERSION_MAJOR@.@HIP_LIB_VERSION_MINOR@ DESTINATION lib) +endif() + +############################# +# Rocclr install +############################# + +set(ROCclr_BUILD_DIR "@ROCclr_DIR@/../../..") +set(ROCclr_LIB_DIR "@ROCM_PATH@/rocclr/lib") +set(ROCclr_CMAKE_DDIR "@ROCM_PATH@/rocclr/lib/cmake/rocclr") +set(ROCclr_CMAKE_SDIR "${ROCclr_BUILD_DIR}/CMakeFiles/Export/lib/cmake/rocclr") + +install(FILES ${ROCclr_BUILD_DIR}/libamdrocclr_static.a DESTINATION ${ROCclr_LIB_DIR}) +install(FILES @ROCclr_DIR@/ROCclrConfig.cmake DESTINATION ${ROCclr_CMAKE_DDIR}) +install(FILES @ROCclr_DIR@/rocclr-config-version.cmake DESTINATION ${ROCclr_CMAKE_DDIR}) +file(GLOB _rocclr_target_files ${ROCclr_CMAKE_SDIR}/rocclr-targets*.cmake) +foreach(_rocclr_target_file ${_rocclr_target_files}) + message(STATUS "_rocclr_target_file: ${_rocclr_target_file}") +endforeach() +install(FILES ${_rocclr_target_files} DESTINATION ${ROCclr_CMAKE_DDIR}) ############################# # Packaging steps diff --git a/rocclr/CMakeLists.txt b/rocclr/CMakeLists.txt new file mode 100755 index 0000000000..e503b1a205 --- /dev/null +++ b/rocclr/CMakeLists.txt @@ -0,0 +1,303 @@ +# Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+
+#project("hip")
+cmake_minimum_required(VERSION 3.5.1)
+
+option(ADDRESS_SANITIZER "Build Address Sanitizer" OFF)
+if (ADDRESS_SANITIZER)
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address")
+    if (BUILD_SHARED_LIBS)
+        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fsanitize=address")
+        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -shared-libsan")
+    else ()
+        set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} -fsanitize=address")
+        set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} -static-libsan")
+    endif ()
+endif ()
+
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-keep-memory -Wl,-Bsymbolic -Wl,--unresolved-symbols=report-all -Wl,--version-script=${CMAKE_CURRENT_LIST_DIR}/hip_hcc.map.in")
+
+set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+
+set(LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib)
+set(CONFIG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hip)
+
+find_package(PythonInterp REQUIRED)
+
+add_definitions( -D__HIP_PLATFORM_AMD__ -DLINUX -D__x86_64__ -D__AMD64__ -DUNIX_OS -DqLittleEndian -DOPENCL_MAJOR=2 -DOPENCL_MINOR=0 -DCL_TARGET_OPENCL_VERSION=220 -DWITH_AQL -DWITH_ONLINE_COMPILER -DATI_OS_LINUX -DATI_ARCH_X86 -DLITTLEENDIAN_CPU -DATI_BITS_64 -DATI_COMP_GCC -DWITH_HSA_DEVICE -DWITH_TARGET_AMDGCN -DOPENCL_EXPORTS -DCL_USE_DEPRECATED_OPENCL_1_0_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_2_0_APIS -DVEGA10_ONLY=false -DWITH_LIGHTNING_COMPILER -DUSE_PROF_API)
+
+if(CMAKE_BUILD_TYPE MATCHES "^Debug$")
+    add_definitions(-DDEBUG)
+endif()
+
+if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
+   (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang"))
+    add_definitions(
+        # Enabling -Wextra or -pedantic will cause
+        # thousands of warnings. Keep things simple for now.
+        -Wall
+        # This one seems impossible to fix for now.
+        # There are hundreds of instances of unused vars/functions
+        # throughout the code base.
+        -Wno-unused-variable
+        -Wno-unused-function)
+endif()
+
+set(USE_PROF_API "1")
+
+# Need to add /opt/rocm/llvm to package search path since ROCclr will
+# find package amd_comgr and amd_comgr will find package llvm/clang.
+# Without this, the system llvm/clang at /usr/local may be found.
+list(APPEND CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} "/opt/rocm/llvm")
+find_package(ROCclr REQUIRED CONFIG)
+
+#############################
+# Profiling API support
+#############################
+# Generate profiling API macros/structures header
+set(PROF_API_STR "${PROJECT_BINARY_DIR}/include/hip/amd_detail/hip_prof_str.h")
+set(PROF_API_HDR "${PROJECT_SOURCE_DIR}/include/hip/amd_detail/hip_runtime_api.h")
+set(PROF_API_SRC "${CMAKE_CURRENT_SOURCE_DIR}")
+set(PROF_API_GEN "${CMAKE_CURRENT_SOURCE_DIR}/hip_prof_gen.py")
+set(PROF_API_LOG "${PROJECT_BINARY_DIR}/hip_prof_gen.log.txt")
+
+add_custom_command(OUTPUT ${PROF_API_STR}
+    COMMAND ${PYTHON_EXECUTABLE} ${PROF_API_GEN} -v -t --priv ${OPT_PROF_API} ${PROF_API_HDR} ${PROF_API_SRC} ${PROF_API_STR}
+    OUTPUT_FILE ${PROF_API_LOG}
+    DEPENDS ${PROF_API_HDR} ${PROF_API_GEN}
+    COMMENT "Generating profiling primitives: ${PROF_API_STR}")
+
+add_custom_target(gen-prof-api-str-header ALL
+    DEPENDS ${PROF_API_STR}
+    SOURCES ${PROF_API_HDR})
+
+# Enable profiling API
+if(NOT DEFINED ROCclr_DIR OR NOT DEFINED LIBOCL_STATIC_DIR OR NOT DEFINED LIBROCclr_STATIC_DIR )
+    # message(FATAL_ERROR "define ROCclr_DIR, LIBOCL_STATIC_DIR\n")
+
+endif()
+
+#APPEND default path for CMAKE_PREFIX_PATH
+#User-provided paths will be searched first since the default path is at the end.
+#Custom install path can be provided at compile time as cmake parameter(-DCMAKE_PREFIX_PATH="")
+#/opt/rocm:default:For amd_comgr,hsa-runtime64
+#/opt/rocm/llvm/:default:For llvm/clang pulled in as dependency from hsa/comgr
+list( APPEND CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} "/opt/rocm")
+
+list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules" )
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")
+
+message(STATUS "Hsa runtime found at ${hsa-runtime64_DIR}.")
+message(STATUS "Code Object Manager found at ${amd_comgr_DIR}.")
+
+find_package(LLVM REQUIRED CONFIG
+    PATHS
+    /opt/rocm/llvm
+    PATH_SUFFIXES
+    lib/cmake/llvm)
+
+message(STATUS "llvm found at ${LLVM_DIR}.")
+set(LLVM_ROOT "${LLVM_DIR}/../../..")
+
+add_library(hip64 OBJECT
+    hip_context.cpp
+    hip_code_object.cpp
+    hip_device.cpp
+    hip_device_runtime.cpp
+    hip_error.cpp
+    hip_event.cpp
+    hip_fatbin.cpp
+    hip_global.cpp
+    hip_hmm.cpp
+    hip_memory.cpp
+    hip_module.cpp
+    hip_peer.cpp
+    hip_platform.cpp
+    hip_profile.cpp
+    hip_stream.cpp
+    hip_stream_ops.cpp
+    hip_surface.cpp
+    hip_texture.cpp
+    hip_activity.cpp
+    hip_intercept.cpp
+    hip_rtc.cpp
+    cl_gl.cpp
+    cl_lqdflash_amd.cpp
+    fixme.cpp
+    hip_graph.cpp
+    hip_graph_internal.cpp
+    )
+set_target_properties(hip64 PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+target_include_directories(hip64
+    PUBLIC
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_BINARY_DIR}/include
+    PRIVATE
+    ${PROJECT_SOURCE_DIR}
+    ${PROJECT_SOURCE_DIR}/amdocl
+    ${ROCR_INCLUDES}
+    $
+    $)
+target_compile_definitions(hip64
+    PRIVATE
+    $)
+
+if(ROCclr_FOUND)
+    message(STATUS "ROCclr found at ${ROCclr_DIR}")
+    target_include_directories(hip64
+        PRIVATE
+        $)
+    target_compile_definitions(hip64
+        PRIVATE
+        $)
+endif()
+
+# Short-Term solution for pre-compiled headers for online compilation
+# Enable pre compiled header
+if(__HIP_ENABLE_PCH)
+    execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh ${PROJECT_BINARY_DIR}/include ${PROJECT_SOURCE_DIR}/include ${LLVM_ROOT} ${HSA_PATH}" COMMAND_ECHO STDERR RESULT_VARIABLE EMBED_PCH_RC)
+    if (EMBED_PCH_RC AND NOT EMBED_PCH_RC EQUAL 0)
+        message(FATAL_ERROR "Failed to embed PCH")
+ endif() + add_definitions(-D__HIP_ENABLE_PCH) +endif() + +# Enable preprocessed hiprtc-builtins library +if(__HIP_ENABLE_RTC) + if(WIN32) + set(HIPRTC_LIB_NAME "hiprtc-builtins64_${HIP_LIB_VERSION_MAJOR}${HIP_LIB_VERSION_MINOR}.dll") + else() + set(HIPRTC_LIB_NAME "libhiprtc-builtins.so.${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}") + endif() + execute_process( + COMMAND sh -c "mkdir -p ${PROJECT_BINARY_DIR}/lib; ${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh ${PROJECT_BINARY_DIR}/include ${PROJECT_SOURCE_DIR}/include ${LLVM_ROOT} ${HSA_PATH} -r ${PROJECT_BINARY_DIR}/lib/${HIPRTC_LIB_NAME}" + COMMAND_ECHO STDERR + RESULT_VARIABLE EMBED_RTC_RC + ) + if (EMBED_RTC_RC AND NOT EMBED_RTC_RC EQUAL 0) + message(FATAL_ERROR "Failed to create hiprtc shared lib") + endif() + install(FILES ${PROJECT_BINARY_DIR}/lib/${HIPRTC_LIB_NAME} DESTINATION lib) +endif() + +# Enable profiling API +if(USE_PROF_API EQUAL 1) + find_path(PROF_API_HEADER_DIR prof_protocol.h + HINTS + ${PROF_API_HEADER_PATH} + PATHS + ${ROCM_PATH}/roctracer + PATH_SUFFIXES + include/ext) + + if(NOT PROF_API_HEADER_DIR) + message(WARNING "Profiling API header not found. Disabling roctracer integration. Use -DPROF_API_HEADER_PATH=") + else() + target_compile_definitions(hip64 PUBLIC USE_PROF_API=1) + target_include_directories(hip64 PUBLIC ${PROF_API_HEADER_DIR}) + message(STATUS "Profiling API: ${PROF_API_HEADER_DIR}") + endif() +endif() + +set_target_properties( + hip64 PROPERTIES + CXX_STANDARD 14 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + ) +add_dependencies(hip64 gen-prof-api-str-header) + +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + + +if(${BUILD_SHARED_LIBS}) + + add_library(amdhip64 + $ + ) + + set_target_properties( + amdhip64 PROPERTIES + VERSION ${HIP_LIB_VERSION_STRING} + SOVERSION ${HIP_LIB_VERSION_MAJOR} + ) + + set_target_properties(hip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR}) + +else() + + add_library(amdhip64 STATIC + $ + ) + +endif() + +set_target_properties(amdhip64 PROPERTIES LINK_FLAGS_RELEASE -s) +set_target_properties(amdhip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR}) +add_library(host INTERFACE) +target_link_libraries(host INTERFACE amdhip64) + +add_library(device INTERFACE) +target_link_libraries(device INTERFACE host) + +# Short-Term solution for pre-compiled headers for online compilation +if(__HIP_ENABLE_PCH) + target_sources(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o) +endif() + +target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64) +add_custom_command(TARGET amdhip64 POST_BUILD COMMAND + ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/.hipInfo ${PROJECT_BINARY_DIR}/lib/.hipInfo) +add_custom_command(TARGET amdhip64 POST_BUILD COMMAND cp -rf + ${PROJECT_SOURCE_DIR}/include ${PROJECT_BINARY_DIR}/) + +INSTALL(PROGRAMS $ DESTINATION lib COMPONENT MAIN) + +INSTALL(TARGETS amdhip64 host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR}) +INSTALL(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::) + +INSTALL(TARGETS amdhip64 host device EXPORT hip-lang-targets DESTINATION ${LIB_INSTALL_DIR}) +INSTALL(EXPORT hip-lang-targets DESTINATION ${CONFIG_LANG_PACKAGE_INSTALL_DIR} NAMESPACE hip-lang::) + +include(CMakePackageConfigHelpers) + +configure_package_config_file( + ${PROJECT_SOURCE_DIR}/hip-lang-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config.cmake + INSTALL_DESTINATION ${CONFIG_LANG_PACKAGE_INSTALL_DIR} + PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR + 
) + +write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config-version.cmake + VERSION "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_GITDATE}" + COMPATIBILITY SameMajorVersion + ) +install( + FILES + ${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config-version.cmake + DESTINATION + ${CONFIG_LANG_PACKAGE_INSTALL_DIR}/ + ) diff --git a/rocclr/amd_hsa_elf.hpp b/rocclr/amd_hsa_elf.hpp new file mode 100644 index 0000000000..c5a7ca40a1 --- /dev/null +++ b/rocclr/amd_hsa_elf.hpp @@ -0,0 +1,118 @@ +/* +Copyright (c) 2015-2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +// AMDGPU OS for HSA compatible compute kernels. +enum { ELFOSABI_AMDGPU_HSA = 64, ELFOSABI_AMDGPU_PAL = 65, ELFOSABI_AMDGPU_MESA3D = 66 }; + +enum { + ELFABIVERSION_AMDGPU_HSA_V2 = 0, + ELFABIVERSION_AMDGPU_HSA_V3 = 1, + ELFABIVERSION_AMDGPU_HSA_V4 = 2 +}; + +// AMDGPU specific e_flags +enum : unsigned { + EF_AMDGPU_MACH = 0x0ff, + // AMDGPU processors + EF_AMDGPU_MACH_NONE = 0x000, + EF_AMDGPU_MACH_R600_R600 = 0x001, + EF_AMDGPU_MACH_R600_R630 = 0x002, + EF_AMDGPU_MACH_R600_RS880 = 0x003, + EF_AMDGPU_MACH_R600_RV670 = 0x004, + EF_AMDGPU_MACH_R600_RV710 = 0x005, + EF_AMDGPU_MACH_R600_RV730 = 0x006, + EF_AMDGPU_MACH_R600_RV770 = 0x007, + EF_AMDGPU_MACH_R600_CEDAR = 0x008, + EF_AMDGPU_MACH_R600_CYPRESS = 0x009, + EF_AMDGPU_MACH_R600_JUNIPER = 0x00a, + EF_AMDGPU_MACH_R600_REDWOOD = 0x00b, + EF_AMDGPU_MACH_R600_SUMO = 0x00c, + EF_AMDGPU_MACH_R600_BARTS = 0x00d, + EF_AMDGPU_MACH_R600_CAICOS = 0x00e, + EF_AMDGPU_MACH_R600_CAYMAN = 0x00f, + EF_AMDGPU_MACH_R600_TURKS = 0x010, + EF_AMDGPU_MACH_R600_RESERVED_FIRST = 0x011, + EF_AMDGPU_MACH_R600_RESERVED_LAST = 0x01f, + EF_AMDGPU_MACH_R600_FIRST = EF_AMDGPU_MACH_R600_R600, + EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS, + + EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, + EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, + EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, + EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023, + EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024, + EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025, + EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X027 = 0x027, + EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028, + EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029, + EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a, + EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b, + EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, + EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d, + EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e, + EF_AMDGPU_MACH_AMDGCN_GFX906 = 
0x02f, + EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030, + EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031, + EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032, + EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, + EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034, + EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035, + EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036, + EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037, + EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038, + EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039, + EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, + EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b, + EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c, + EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, + EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A, + + // Indicates if the "xnack" target feature is enabled for all code contained + // in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3. + EF_AMDGPU_FEATURE_XNACK_V3 = 0x100, + // Indicates if the "sramecc" target feature is enabled for all code + // contained in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3. + EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200, + + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. + EF_AMDGPU_FEATURE_XNACK_V4 = 0x300, + EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000, + EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100, + EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200, + EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300, + + // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values. + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. + EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00, + EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000, + EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400, + EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800, + EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00, +}; diff --git a/rocclr/cl_gl.cpp b/rocclr/cl_gl.cpp new file mode 100644 index 0000000000..6a32525da2 --- /dev/null +++ b/rocclr/cl_gl.cpp @@ -0,0 +1,2432 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#include "top.hpp"
+
+#ifdef _WIN32
+#include 
+#include 
+#include 
+// This is necessary since there are common GL/D3D10 functions
+#include "cl_d3d9_amd.hpp"
+#include "cl_d3d10_amd.hpp"
+#include "cl_d3d11_amd.hpp"
+#endif  //_WIN32
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include "cl_common.hpp"
+#include "cl_gl_amd.hpp"
+
+#include "device/device.hpp"
+
+/* The pixel internal format for DOPP texture defined in gl_enum.h */
+#define GL_BGR8_ATI 0x8083
+#define GL_BGRA8_ATI 0x8088
+
+#include 
+#include 
+
+
+/*! \addtogroup API
+ *  @{
+ *
+ *  \addtogroup CL_GL_Interops
+ *
+ *  This section discusses OpenCL functions that allow applications to
+ *  use OpenGL buffer/texture/render-buffer objects as OpenCL memory
+ *  objects. This allows efficient sharing of data between OpenCL
+ *  and OpenGL. The OpenCL API can be used to execute kernels that read
+ *  and/or write memory objects that are also an OpenGL buffer object
+ *  or a texture. An OpenCL image object can be created from an OpenGL
+ *  texture or renderbuffer object. An OpenCL buffer object can be
+ *  created from an OpenGL buffer object. An OpenCL memory object can
+ *  be created from an OpenGL texture/buffer/render-buffer object or
+ *  the default system provided framebuffer if and only if the OpenCL
+ *  clContext has been created from a GL clContext. OpenGL contexts are
+ *  created using platform specific APIs (EGL, CGL, WGL, GLX are some
+ *  of the platform specific APIs that allow applications to create GL
+ *  contexts). The appropriate platform API (such as EGL, CGL, WGL,
+ *  GLX) will be extended to allow a CL clContext to be created from a
+ *  GL clContext. Creating an OpenCL memory object from the default
+ *  system provided framebuffer will also require an appropriate
+ *  extension to the platform API. Refer to the appropriate platform
+ *  API documentation to understand how to create a CL clContext from a
+ *  GL clContext and how to create a CL memory object from the default
+ *  system provided framebuffer.
+ *
+ *  @{
+ *
+ *  \addtogroup clCreateFromGLBuffer
+ *
+ *  @{
+ */
+
+/*! \brief Creates an OpenCL buffer object from an OpenGL buffer object.
+ *
+ *  \param clContext is a valid OpenCL clContext created from an OpenGL clContext.
+ *
+ *  \param clFlags is a bit-field that is used to specify usage information. Only
+ *  CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY and CL_MEM_READ_WRITE can be used.
+ *
+ *  \param glBufferName is a GL buffer object name. The GL buffer
+ *  object must have a data store created though it does not need to
+ *  be initialized. The size of the data store will be used to
+ *  determine the size of the CL buffer object.
+ *
+ *  \param pCpuMem is a pointer to the buffer data that may already be
+ *  allocated by the application. The size of the buffer that pCpuMem points
+ *  to must be >= \a size bytes. Passing in a pointer to an already allocated
+ *  buffer on the host and using it as a buffer object allows applications to
+ *  share data efficiently with kernels and the host.
+ *
+ *  \param errcode_ret will return an appropriate error code. If errcode_ret
+ *  is NULL, no error code is returned.
+ *
+ *  \return valid non-zero OpenCL buffer object and errcode_ret is set
+ *  to CL_SUCCESS if the buffer object is created successfully. It
+ *  returns a NULL value with one of the following error values
+ *  returned in \a errcode_ret:
+ *  - CL_INVALID_CONTEXT if \a clContext is not a valid clContext.
+ *  - CL_INVALID_VALUE if values specified in \a clFlags are not valid.
+ *  - CL_INVALID_GL_OBJECT if glBufferName is not a GL buffer object or is a
+ *    GL buffer object but does not have a data store created.
+ *  - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required
+ *    by the runtime.
+ *
+ *  \version 1.0r29
+ */
+RUNTIME_ENTRY_RET(cl_mem, clCreateFromGLBuffer,
+                  (cl_context context, cl_mem_flags flags, GLuint bufobj, cl_int* errcode_ret)) {
+  cl_mem clMemObj = NULL;
+
+  if (!is_valid(context)) {
+    *not_null(errcode_ret) = CL_INVALID_CONTEXT;
+    LogWarning("invalid parameter \"context\"");
+    return clMemObj;
+  }
+
+  if (!(((flags & CL_MEM_READ_ONLY) == CL_MEM_READ_ONLY) ||
+        ((flags & CL_MEM_WRITE_ONLY) == CL_MEM_WRITE_ONLY) ||
+        ((flags & CL_MEM_READ_WRITE) == CL_MEM_READ_WRITE))) {
+    *not_null(errcode_ret) = CL_INVALID_VALUE;
+    LogWarning("invalid parameter \"flags\"");
+    return clMemObj;
+  }
+
+  return (amd::clCreateFromGLBufferAMD(*as_amd(context), flags, bufobj, errcode_ret));
+}
+RUNTIME_EXIT
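A minimal caller-side sketch of this entry point (not part of the patch): it assumes 'context' was created from a current OpenGL context and that 'glBuf' names a GL buffer object whose data store is already allocated; both are hypothetical here. Acquiring and releasing the GL object around kernel use follows the standard CL/GL interop protocol.

#include <CL/cl.h>
#include <CL/cl_gl.h>

cl_mem shareBuffer(cl_context context, cl_command_queue queue, cl_GLuint glBuf) {
    cl_int err = CL_SUCCESS;
    cl_mem clBuf = clCreateFromGLBuffer(context, CL_MEM_READ_WRITE, glBuf, &err);
    if (err != CL_SUCCESS) return NULL;
    // GL objects must be acquired before kernels touch them, and released after.
    clEnqueueAcquireGLObjects(queue, 1, &clBuf, 0, NULL, NULL);
    /* ... enqueue kernels that read/write clBuf ... */
    clEnqueueReleaseGLObjects(queue, 1, &clBuf, 0, NULL, NULL);
    return clBuf;
}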
+
+/*! \brief creates the following:
+ *  - an OpenCL 2D image object from an OpenGL 2D texture object
+ *    or a single face of an OpenGL cubemap texture object,
+ *  - an OpenCL 2D image array object from an OpenGL 2D texture array object,
+ *  - an OpenCL 1D image object from an OpenGL 1D texture object,
+ *  - an OpenCL 1D image buffer object from an OpenGL texture buffer object,
+ *  - an OpenCL 1D image array object from an OpenGL 1D texture array object,
+ *  - an OpenCL 3D image object from an OpenGL 3D texture object.
+ *
+ *  \param clContext is a valid OpenCL clContext created from an OpenGL clContext.
+ *
+ *  \param clFlags is a bit-field that is used to specify usage information.
+ *  Only CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY and CL_MEM_READ_WRITE values
+ *  can be used.
+ *
+ *  \param texture_target must be GL_TEXTURE_1D, GL_TEXTURE_1D_ARRAY,
+ *  GL_TEXTURE_BUFFER, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D,
+ *  GL_TEXTURE_2D, GL_TEXTURE_CUBE_MAP_POSITIVE_X,
+ *  GL_TEXTURE_CUBE_MAP_POSITIVE_Y, GL_TEXTURE_CUBE_MAP_POSITIVE_Z,
+ *  GL_TEXTURE_CUBE_MAP_NEGATIVE_X, GL_TEXTURE_CUBE_MAP_NEGATIVE_Y,
+ *  GL_TEXTURE_CUBE_MAP_NEGATIVE_Z or GL_TEXTURE_RECTANGLE_ARB.
+ *
+ *  \param miplevel is the mipmap level to be used. If \a texture_target
+ *  is GL_TEXTURE_BUFFER, \a miplevel must be 0.
+ *
+ *  \param texture is a GL 1D, 2D, 3D, 1D array, 2D array, cubemap,
+ *  rectangle or buffer texture object.
+ *  The texture object must be a complete texture as per
+ *  OpenGL rules on texture completeness. The texture format and dimensions
+ *  defined by OpenGL for the specified miplevel of the texture will be
+ *  used to create the OpenCL image memory object. Only GL texture formats
+ *  that map to appropriate image channel order and data type can be used
+ *  to create the OpenCL image memory object.
+ *
+ *  \param errcode_ret will return an appropriate error code. If \a
+ *  errcode_ret is NULL, no error code is returned.
+ *
+ *  \return A valid non-zero OpenCL image object and \a errcode_ret is set to
+ *  CL_SUCCESS if the image object is created successfully. It returns a NULL value
+ *  with one of the following error values returned in \a errcode_ret:
+ *  - CL_INVALID_CONTEXT if \a clContext is not a valid clContext or was not
+ *    created from a GL clContext.
+ *  - CL_INVALID_VALUE if values specified in \a clFlags are not valid.
+ *  - CL_INVALID_MIP_LEVEL if \a miplevel is not a valid mip-level for \a texture.
+ *  - CL_INVALID_GL_OBJECT if \a texture is not an appropriate GL 2D texture,
+ *    cubemap or texture rectangle.
+ *  - CL_INVALID_IMAGE_FORMAT_DESCRIPTOR if the OpenGL texture format does not
+ *    map to an appropriate OpenCL image format.
+ *  - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required
+ *    by the runtime.
+ *
+ *  \version 1.2r07
+ */
+RUNTIME_ENTRY_RET(cl_mem, clCreateFromGLTexture,
+                  (cl_context context, cl_mem_flags flags, GLenum texture_target, GLint miplevel,
+                   GLuint texture, cl_int* errcode_ret)) {
+  cl_mem clMemObj = NULL;
+
+  if (!is_valid(context)) {
+    *not_null(errcode_ret) = CL_INVALID_CONTEXT;
+    LogWarning("invalid parameter \"context\"");
+    return clMemObj;
+  }
+
+  if (!(((flags & CL_MEM_READ_ONLY) == CL_MEM_READ_ONLY) ||
+        ((flags & CL_MEM_WRITE_ONLY) == CL_MEM_WRITE_ONLY) ||
+        ((flags & CL_MEM_READ_WRITE) == CL_MEM_READ_WRITE))) {
+    *not_null(errcode_ret) = CL_INVALID_VALUE;
+    LogWarning("invalid parameter \"flags\"");
+    return clMemObj;
+  }
+
+  const std::vector<amd::Device*>& devices = as_amd(context)->devices();
+  bool supportPass = false;
+  bool sizePass = false;
+  for (const auto& it : devices) {
+    if (it->info().imageSupport_) {
+      supportPass = true;
+    }
+  }
+  if (!supportPass) {
+    *not_null(errcode_ret) = CL_INVALID_OPERATION;
+    LogWarning("there are no devices in context to support images");
+    return static_cast<cl_mem>(0);
+  }
+
+  return amd::clCreateFromGLTextureAMD(*as_amd(context), flags, texture_target, miplevel, texture,
+                                       errcode_ret);
+}
+RUNTIME_EXIT
+
+/*! @}
+ *  \addtogroup clCreateFromGLTexture2D
+ *  @{
+ */
+
+/*! \brief Create an OpenCL 2D image object from an OpenGL 2D texture object.
+ *
+ *  \param clContext is a valid OpenCL clContext created from an OpenGL clContext.
+ *
+ *  \param clFlags is a bit-field that is used to specify usage information.
+ *  Only CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY and CL_MEM_READ_WRITE values
+ *  can be used.
+ *
+ *  \param target must be GL_TEXTURE_2D, GL_TEXTURE_CUBE_MAP_POSITIVE_X,
+ *  GL_TEXTURE_CUBE_MAP_POSITIVE_Y, GL_TEXTURE_CUBE_MAP_POSITIVE_Z,
+ *  GL_TEXTURE_CUBE_MAP_NEGATIVE_X, GL_TEXTURE_CUBE_MAP_NEGATIVE_Y,
+ *  GL_TEXTURE_CUBE_MAP_NEGATIVE_Z or GL_TEXTURE_RECTANGLE_ARB.
+ *
+ *  \param miplevel is the mipmap level to be used.
+ *
+ *  \param texture is a GL 2D texture, cubemap or texture rectangle
+ *  object name. The texture object must be a complete texture as per
+ *  OpenGL rules on texture completeness. The \a texture format and
+ *  dimensions specified using appropriate glTexImage2D call for \a
+ *  miplevel will be used to create the 2D image object. Only GL
+ *  texture formats that map to appropriate image channel order and
+ *  data type can be used to create the 2D image object.
+ *
+ *  \param errcode_ret will return an appropriate error code. If \a
+ *  errcode_ret is NULL, no error code is returned.
+ *
+ *  \return A valid non-zero OpenCL image object and \a errcode_ret is set to
+ *  CL_SUCCESS if the image object is created successfully. It returns a NULL value
+ *  with one of the following error values returned in \a errcode_ret:
+ *  - CL_INVALID_CONTEXT if \a clContext is not a valid clContext or was not
+ *    created from a GL clContext.
+ *  - CL_INVALID_VALUE if values specified in \a clFlags are not valid.
+ *  - CL_INVALID_MIP_LEVEL if \a miplevel is not a valid mip-level for \a texture.
+ *  - CL_INVALID_GL_OBJECT if \a texture is not an appropriate GL 2D texture,
+ *    cubemap or texture rectangle.
+ *  - CL_INVALID_IMAGE_FORMAT_DESCRIPTOR if the OpenGL texture format does not
+ *    map to an appropriate OpenCL image format.
+ *  - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required
+ *    by the runtime.
+ *
+ *  \version 1.0r29
+ */
+RUNTIME_ENTRY_RET(cl_mem, clCreateFromGLTexture2D,
+                  (cl_context context, cl_mem_flags flags, GLenum target, GLint miplevel,
+                   GLuint texture, cl_int* errcode_ret)) {
+  cl_mem clMemObj = NULL;
+
+  if (!is_valid(context)) {
+    *not_null(errcode_ret) = CL_INVALID_CONTEXT;
+    LogWarning("invalid parameter \"context\"");
+    return clMemObj;
+  }
+
+  if (!(((flags & CL_MEM_READ_ONLY) == CL_MEM_READ_ONLY) ||
+        ((flags & CL_MEM_WRITE_ONLY) == CL_MEM_WRITE_ONLY) ||
+        ((flags & CL_MEM_READ_WRITE) == CL_MEM_READ_WRITE))) {
+    *not_null(errcode_ret) = CL_INVALID_VALUE;
+    LogWarning("invalid parameter \"flags\"");
+    return clMemObj;
+  }
+
+  const std::vector<amd::Device*>& devices = as_amd(context)->devices();
+  bool supportPass = false;
+  bool sizePass = false;
+  for (const auto& it : devices) {
+    if (it->info().imageSupport_) {
+      supportPass = true;
+    }
+  }
+  if (!supportPass) {
+    *not_null(errcode_ret) = CL_INVALID_OPERATION;
+    LogWarning("there are no devices in context to support images");
+    return static_cast<cl_mem>(0);
+  }
+
+  return amd::clCreateFromGLTextureAMD(*as_amd(context), flags, target, miplevel, texture,
+                                       errcode_ret);
+}
+RUNTIME_EXIT
+
+/*! @}
+ *  \addtogroup clCreateFromGLTexture3D
+ *  @{
+ */
+
+/*! \brief Create an OpenCL 3D image object from an OpenGL 3D texture object.
+ *
+ *  \param clContext is a valid OpenCL clContext created from an OpenGL clContext.
+ *
+ *  \param clFlags is a bit-field that is used to specify usage information.
+ *  Only CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY and CL_MEM_READ_WRITE values
+ *  can be used.
+ *
+ *  \param target must be GL_TEXTURE_3D.
+ *
+ *  \param miplevel is the mipmap level to be used.
+ *
+ *  \param texture is a GL 3D texture object [name].
+ *  The texture object must be a complete texture as per OpenGL rules on texture
+ *  completeness. The \a texture format and dimensions specified using appropriate
+ *  glTexImage3D call for \a miplevel will be used to create the 3D image object.
+ *  Only GL texture formats that map to appropriate image channel order and
+ *  data type can be used to create the 3D image object.
+ *
+ *  \param errcode_ret will return an appropriate error code. If \a errcode_ret
+ *  is NULL, no error code is returned.
+ *
+ *  \return A valid non-zero OpenCL image object and \a errcode_ret is set to
+ *  CL_SUCCESS if the image object is created successfully. It returns a NULL value
+ *  with one of the following error values returned in \a errcode_ret:
+ *  - CL_INVALID_CONTEXT if \a clContext is not a valid clContext or was not
+ *    created from a GL clContext.
+ *  - CL_INVALID_VALUE if values specified in \a clFlags are not valid.
+ *  - CL_INVALID_MIP_LEVEL if \a miplevel is not a valid mip-level for \a texture.
+ *  - CL_INVALID_GL_OBJECT if \a texture is not an GL 3D texture.
+ *  - CL_INVALID_IMAGE_FORMAT_DESCRIPTOR if the OpenGL texture format does not
+ *    map to an appropriate OpenCL image format.
+ *  - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required
+ *    by the runtime.
+ *
+ *  \version 1.0r29
+ */
+RUNTIME_ENTRY_RET(cl_mem, clCreateFromGLTexture3D,
+                  (cl_context context, cl_mem_flags flags, GLenum target, GLint miplevel,
+                   GLuint texture, cl_int* errcode_ret)) {
+  cl_mem clMemObj = NULL;
+
+  if (!is_valid(context)) {
+    *not_null(errcode_ret) = CL_INVALID_CONTEXT;
+    LogWarning("invalid parameter \"context\"");
+    return clMemObj;
+  }
+
+  if (!(((flags & CL_MEM_READ_ONLY) == CL_MEM_READ_ONLY) ||
+        ((flags & CL_MEM_WRITE_ONLY) == CL_MEM_WRITE_ONLY) ||
+        ((flags & CL_MEM_READ_WRITE) == CL_MEM_READ_WRITE))) {
+    *not_null(errcode_ret) = CL_INVALID_VALUE;
+    LogWarning("invalid parameter \"flags\"");
+    return clMemObj;
+  }
+
+  const std::vector<amd::Device*>& devices = as_amd(context)->devices();
+  bool supportPass = false;
+  bool sizePass = false;
+  for (const auto& it : devices) {
+    if (it->info().imageSupport_) {
+      supportPass = true;
+    }
+  }
+  if (!supportPass) {
+    *not_null(errcode_ret) = CL_INVALID_OPERATION;
+    LogWarning("there are no devices in context to support images");
+    return static_cast<cl_mem>(0);
+  }
+
+  return amd::clCreateFromGLTextureAMD(*as_amd(context), flags, target, miplevel, texture,
+                                       errcode_ret);
+}
+RUNTIME_EXIT
+
+/*! @}
+ *  \addtogroup clCreateFromGLRenderbuffer
+ *  @{
+ */
+
+/*! \brief Create an OpenCL 2D image object from an OpenGL renderbuffer object.
+ *
+ *  \param clContext is a valid OpenCL clContext created from an OpenGL clContext.
+ *
+ *  \param clFlags is a bit-field that is used to specify usage information.
+ *  Only CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY and CL_MEM_READ_WRITE values
+ *  can be used.
+ *
+ *  \param renderbuffer is a GL renderbuffer object name. The renderbuffer
+ *  storage must be specified before the image object can be created. Only
+ *  GL renderbuffer formats that map to appropriate image channel order and
+ *  data type can be used to create the 2D image object.
+ *
+ *  \param errcode_ret will return an appropriate error code. If \a errcode_ret
+ *  is NULL, no error code is returned.
+ *
+ *  \return A valid non-zero OpenCL image object and \a errcode_ret is set
+ *  to CL_SUCCESS if the image object is created successfully. It returns a
+ *  NULL value with one of the following error values returned in \a errcode_ret:
+ *  - CL_INVALID_CONTEXT if \a clContext is not a valid clContext or was not
+ *    created from a GL clContext.
+ *  - CL_INVALID_VALUE if values specified in \a clFlags are not valid.
+ *  - CL_INVALID_GL_OBJECT if \a renderbuffer is not an GL renderbuffer object.
+ *  - CL_INVALID_IMAGE_FORMAT_DESCRIPTOR if the OpenGL renderbuffer format
+ *    does not map to an appropriate OpenCL image format.
+ *  - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required
+ *    by the runtime.
+ *
+ *  \version 1.0r29
+ */
+RUNTIME_ENTRY_RET(cl_mem, clCreateFromGLRenderbuffer, (cl_context context, cl_mem_flags flags,
+                                                       GLuint renderbuffer, cl_int* errcode_ret)) {
+  cl_mem clMemObj = NULL;
+
+  if (!is_valid(context)) {
+    *not_null(errcode_ret) = CL_INVALID_CONTEXT;
+    LogWarning("invalid parameter \"context\"");
+    return clMemObj;
+  }
+
+  if (!(((flags & CL_MEM_READ_ONLY) == CL_MEM_READ_ONLY) ||
+        ((flags & CL_MEM_WRITE_ONLY) == CL_MEM_WRITE_ONLY) ||
+        ((flags & CL_MEM_READ_WRITE) == CL_MEM_READ_WRITE))) {
+    *not_null(errcode_ret) = CL_INVALID_VALUE;
+    LogWarning("invalid parameter \"flags\"");
+    return clMemObj;
+  }
+
+  return (amd::clCreateFromGLRenderbufferAMD(*as_amd(context), flags, renderbuffer, errcode_ret));
+}
+RUNTIME_EXIT
+
+/*! @}
+ *  \addtogroup clGetGLObjectInfo
+ *  @{
+ */
+
+/*!
+/*! @}
+ * \addtogroup clGetGLObjectInfo
+ * @{
+ */
+
+/*! \brief Query GL object type from a CL memory object.
+ *
+ * \param memobj is a valid cl_mem object created from a GL object.
+ *
+ * \param gl_object_type returns the type of GL object attached to memobj
+ * and can be CL_GL_OBJECT_BUFFER, CL_GL_OBJECT_TEXTURE2D,
+ * CL_GL_OBJECT_TEXTURE_RECTANGLE, CL_GL_OBJECT_TEXTURE3D, or
+ * CL_GL_OBJECT_RENDERBUFFER. If \a gl_object_type is NULL, it is ignored.
+ *
+ * \param gl_object_name returns the GL object name used to create memobj.
+ * If \a gl_object_name is NULL, it is ignored.
+ *
+ * \return One of the following values is returned:
+ * - CL_SUCCESS if the call was executed successfully.
+ * - CL_INVALID_MEM_OBJECT if \a memobj is not a valid OpenCL memory object.
+ * - CL_INVALID_GL_OBJECT if there is no GL object associated with \a memobj.
+ *
+ * \version 1.0r29
+ */
+RUNTIME_ENTRY(cl_int, clGetGLObjectInfo,
+              (cl_mem memobj, cl_gl_object_type* gl_object_type, GLuint* gl_object_name)) {
+  if (!is_valid(memobj)) {
+    LogWarning("\"memobj\" is not a valid cl_mem object");
+    return CL_INVALID_MEM_OBJECT;
+  }
+
+  amd::InteropObject* interop = as_amd(memobj)->getInteropObj();
+  if (NULL == interop) {
+    LogWarning("CL object \"memobj\" is not created from GL object");
+    return CL_INVALID_GL_OBJECT;
+  }
+
+  amd::GLObject* glObject = interop->asGLObject();
+  if (NULL == glObject) {
+    LogWarning("CL object \"memobj\" is not created from GL object");
+    return CL_INVALID_GL_OBJECT;
+  }
+
+  cl_int result;
+
+  cl_gl_object_type clGLType = glObject->getCLGLObjectType();
+  result = amd::clGetInfo(clGLType, sizeof(cl_gl_object_type), gl_object_type, NULL);
+
+  GLuint glName = glObject->getGLName();
+  result |= amd::clGetInfo(glName, sizeof(GLuint), gl_object_name, NULL);
+
+  return result;
+}
+RUNTIME_EXIT
+
+/*! @}
+ * \addtogroup clGetGLTextureInfo
+ * @{
+ */
+
+/*! \brief Query additional information about the GL texture object associated
+ * with \a memobj.
+ *
+ * \param memobj is a valid cl_mem object created from a GL object.
+ *
+ * \param param_name specifies what additional information about the GL
+ * texture object associated with \a memobj to query:
+ * - CL_GL_TEXTURE_TARGET (GLenum) to query the \a target argument specified
+ *   in clCreateFromGLTexture2D or clCreateFromGLTexture3D calls.
+ * - CL_GL_MIPMAP_LEVEL (GLint) to query the \a miplevel argument specified
+ *   in clCreateFromGLTexture2D or clCreateFromGLTexture3D calls.
+ *
+ * \param param_value is a pointer to memory where the appropriate result
+ * being queried is returned. If \a param_value is NULL, it is ignored.
+ *
+ * \param param_value_size is used to specify the size in bytes of memory
+ * pointed to by \a param_value. This size must be >= the size of the return
+ * type as described for the \a param_name argument (GLenum or GLint).
+ * \a param_value_size_ret returns the actual size in bytes of data copied to
+ * \a param_value. If \a param_value_size_ret is NULL, it is ignored.
+ *
+ * \return One of the following values is returned:
+ * - CL_SUCCESS if the function is executed successfully.
+ * - CL_INVALID_MEM_OBJECT if \a memobj is not a valid OpenCL memory object.
+ * - CL_INVALID_GL_OBJECT if there is no GL texture object (2D or 3D texture)
+ *   associated with \a memobj.
+ * - CL_INVALID_VALUE if \a param_name is not valid, or if the size in bytes
+ *   specified by \a param_value_size is < the size of the return type required
+ *   by \a param_name and \a param_value is not NULL, or if \a param_value and
+ *   \a param_value_size_ret are NULL.
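+ *
+ * Query sketch (illustrative; \c img is an assumed cl_mem created from a GL
+ * texture):
+ * \code
+ * GLenum target;
+ * cl_int err = clGetGLTextureInfo(img, CL_GL_TEXTURE_TARGET,
+ *                                 sizeof(target), &target, NULL);
+ * \endcode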
+ * + * \version 1.0r29 + */ +RUNTIME_ENTRY(cl_int, clGetGLTextureInfo, + (cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size, + void* param_value, size_t* param_value_size_ret)) { + if (!is_valid(memobj)) { + LogWarning("\"memobj\" is not a valid cl_mem object"); + return CL_INVALID_MEM_OBJECT; + } + amd::InteropObject* interop = as_amd(memobj)->getInteropObj(); + if (NULL == interop) { + LogWarning("CL object \"memobj\" is not created from GL object"); + return CL_INVALID_GL_OBJECT; + } + amd::GLObject* glObject = interop->asGLObject(); + if ((NULL == glObject) || (NULL != glObject->asBufferGL())) { + LogWarning("CL object \"memobj\" is not created from GL texture"); + return CL_INVALID_GL_OBJECT; + } + + switch (param_name) { + case CL_GL_TEXTURE_TARGET: { + GLenum glTarget = glObject->getGLTarget(); + if (glTarget == GL_TEXTURE_CUBE_MAP) { + glTarget = glObject->getCubemapFace(); + } + return amd::clGetInfo(glTarget, param_value_size, param_value, param_value_size_ret); + } + case CL_GL_MIPMAP_LEVEL: { + GLint mipLevel = glObject->getGLMipLevel(); + return amd::clGetInfo(mipLevel, param_value_size, param_value, param_value_size_ret); + } + case CL_GL_NUM_SAMPLES: { + GLsizei numSamples = glObject->getNumSamples(); + return amd::clGetInfo(numSamples, param_value_size, param_value, param_value_size_ret); + } + default: + LogWarning("Unknown param_name in clGetGLTextureInfoAMD"); + break; + } + + return CL_INVALID_VALUE; +} +RUNTIME_EXIT + +/*! @} + * \addtogroup clEnqueueAcquireExtObjects + * @{ + */ + +/*! \brief Acquire OpenCL memory objects that have been created from external + * objects (OpenGL, D3D). + * + * \param command_queue is a valid command-queue. + * + * \param num_objects is the number of memory objects to be acquired + * in \a mem_objects. + * + * \param mem_objects is a pointer to a list of CL memory objects that refer + * to a GL object (buffer/texture/renderbuffer objects or the framebuffer). + * + * \param event_wait_list specify [is a pointer to] events that need to + * complete before this particular command can be executed. + * If \a event_wait_list is NULL, then this particular command does not wait + * on any event to complete. If \a event_wait_list is NULL, + * \a num_events_in_wait_list must be 0. If \a event_wait_list is not NULL, + * the list of events pointed to by \a event_wait_list must be valid and + * \a num_events_in_wait_list must be greater than 0. The events specified in + * \a event_wait_list act as synchronization points. + * + * \param num_events_in_wait_list specify the number of events in + * \a event_wait_list. It must be 0 if \a event_wait_list is NULL. It must be + * greater than 0 if \a event_wait_list is not NULL. + * + * \param event returns an event object that identifies this particular + * command and can be used to query or queue a wait for this particular + * command to complete. \a event can be NULL in which case it will not be + * possible for the application to query the status of this command or queue a + * wait for this command to complete. + * + * \return One of the following values is returned: + * - CL_SUCCESS if the function is executed successfully. + * - CL_SUCCESS if \a num_objects is 0 and \a mem_objects is NULL; the + * function does nothing. + * - CL_INVALID_VALUE if \a num_objects is zero and \a mem_objects is not a + * NULL value or if \a num_objects > 0 and \a mem_objects is NULL. + * - CL_INVALID_MEM_OBJECT if memory objects in \a mem_objects are not valid + * OpenCL memory objects. 
+ * - CL_INVALID_COMMAND_QUEUE if \a command_queue is not a valid command-queue.
+ * - CL_INVALID_CONTEXT if the clContext associated with \a command_queue was
+ *   not created from an OpenGL clContext.
+ * - CL_INVALID_GL_OBJECT if memory objects in \a mem_objects have not been
+ *   created from GL objects.
+ * - CL_INVALID_EVENT_WAIT_LIST if \a event_wait_list is NULL and
+ *   \a num_events_in_wait_list > 0, or \a event_wait_list is not NULL and
+ *   \a num_events_in_wait_list is 0, or if event objects in \a event_wait_list
+ *   are not valid events.
+ * - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources
+ *   required by the OpenCL implementation on the host.
+ *
+ * \version 1.0r29
+ */
+RUNTIME_ENTRY(cl_int, clEnqueueAcquireGLObjects,
+              (cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects,
+               cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event)) {
+  return amd::clEnqueueAcquireExtObjectsAMD(command_queue, num_objects, mem_objects,
+                                            num_events_in_wait_list, event_wait_list, event,
+                                            CL_COMMAND_ACQUIRE_GL_OBJECTS);
+}
+RUNTIME_EXIT
+
+/*! @}
+ * \addtogroup clEnqueueReleaseGLObjects
+ * @{
+ */
+
+/*! \brief Release OpenCL memory objects that have been created from OpenGL
+ * objects.
+ *
+ * \param command_queue is a valid command-queue which is associated with the
+ * OpenCL clContext releasing the OpenGL objects.
+ *
+ * \param num_objects is the number of memory objects to be released
+ * in \a mem_objects.
+ *
+ * \param mem_objects is a pointer to a list of CL memory objects that refer
+ * to a GL object (buffer/texture/renderbuffer objects or the framebuffer).
+ *
+ * \param event_wait_list is a pointer to events that need to complete before
+ * this particular command can be executed.
+ * If \a event_wait_list is NULL, then this particular command does not wait
+ * on any event to complete. If \a event_wait_list is NULL,
+ * \a num_events_in_wait_list must be 0. If \a event_wait_list is not NULL,
+ * the list of events pointed to by \a event_wait_list must be valid and
+ * \a num_events_in_wait_list must be greater than 0. The events specified in
+ * \a event_wait_list act as synchronization points.
+ *
+ * \param num_events_in_wait_list specifies the number of events in
+ * \a event_wait_list. It must be 0 if \a event_wait_list is NULL. It must be
+ * greater than 0 if \a event_wait_list is not NULL.
+ *
+ * \param event returns an event object that identifies this particular
+ * command and can be used to query or queue a wait for this particular
+ * command to complete. \a event can be NULL, in which case it will not be
+ * possible for the application to query the status of this command or queue a
+ * wait for this command to complete.
+ *
+ * \return One of the following values is returned:
+ * - CL_SUCCESS if the function is executed successfully.
+ * - CL_SUCCESS if \a num_objects is 0 and \a mem_objects is NULL; the
+ *   function does nothing.
+ * - CL_INVALID_VALUE if \a num_objects is zero and \a mem_objects is not a
+ *   NULL value or if \a num_objects > 0 and \a mem_objects is NULL.
+ * - CL_INVALID_MEM_OBJECT if memory objects in \a mem_objects are not valid
+ *   OpenCL memory objects.
+ * - CL_INVALID_COMMAND_QUEUE if \a command_queue is not a valid command-queue.
+ * - CL_INVALID_CONTEXT if the clContext associated with \a command_queue was
+ *   not created from an OpenGL clContext.
+ * - CL_INVALID_GL_OBJECT if memory objects in \a mem_objects have not been
+ *   created from GL objects.
+ * - CL_INVALID_EVENT_WAIT_LIST if \a event_wait_list is NULL and
+ *   \a num_events_in_wait_list > 0, or \a event_wait_list is not NULL and
+ *   \a num_events_in_wait_list is 0, or if event objects in \a event_wait_list
+ *   are not valid events.
+ * - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources
+ *   required by the OpenCL implementation on the host.
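+ *
+ * Typical interop sequence (illustrative sketch; \c queue, \c img, \c kernel
+ * and \c gws are assumed, pre-existing host objects):
+ * \code
+ * clEnqueueAcquireGLObjects(queue, 1, &img, 0, NULL, NULL);
+ * clEnqueueNDRangeKernel(queue, kernel, 2, NULL, gws, NULL, 0, NULL, NULL);
+ * clEnqueueReleaseGLObjects(queue, 1, &img, 0, NULL, NULL);
+ * \endcode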
+ *
+ * \version 1.0r29
+ */
+RUNTIME_ENTRY(cl_int, clEnqueueReleaseGLObjects,
+              (cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects,
+               cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event)) {
+  return amd::clEnqueueReleaseExtObjectsAMD(command_queue, num_objects, mem_objects,
+                                            num_events_in_wait_list, event_wait_list, event,
+                                            CL_COMMAND_RELEASE_GL_OBJECTS);
+}
+RUNTIME_EXIT
+
+/*! @}
+ * \addtogroup clCreateEventFromGLsyncKHR
+ * @{
+ */
+
+/*! \brief Creates an event object linked to an OpenGL sync object.
+ * Completion of such an event object is equivalent to waiting for completion
+ * of the fence command associated with the linked GL sync object.
+ *
+ * \param context is a valid OpenCL context created from an OpenGL context
+ * or share group, using the cl_khr_gl_sharing extension.
+ *
+ * \param sync is the 'name' of a sync object in the GL share group associated
+ * with context.
+ *
+ * \param errcode_ret Returns an appropriate error code as described below.
+ * If errcode_ret is NULL, no error code is returned.
+ *
+ * \return A valid OpenCL event object and errcode_ret is set to CL_SUCCESS
+ * if the event object is created successfully. Otherwise, it returns a NULL
+ * value with one of the following error values returned in errcode_ret:
+ * - CL_INVALID_CONTEXT if context is not a valid context or was not created
+ *   from a GL context.
+ * - CL_INVALID_GL_OBJECT if sync is not the name of a sync object in the
+ *   GL share group associated with context.
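+ *
+ * Sketch of gating CL work on a GL fence (illustrative; \c ctx is an assumed
+ * GL-shared context and a GL context is current on the calling thread):
+ * \code
+ * GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ * cl_int err;
+ * cl_event ev = clCreateEventFromGLsyncKHR(ctx, fence, &err);
+ * // pass ev in an event wait list so a CL command starts after the fence
+ * \endcode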
+ *
+ * \version 1.1
+ */
+
+RUNTIME_ENTRY_RET(cl_event, clCreateEventFromGLsyncKHR,
+                  (cl_context context, cl_GLsync clGLsync, cl_int* errcode_ret)) {
+  // create event of fence sync type
+  amd::ClGlEvent* clglEvent = new amd::ClGlEvent(*as_amd(context));
+  clglEvent->context().glenv()->glFlush_();
+  // initially set the status of fence as queued
+  clglEvent->setStatus(CL_SUBMITTED);
+  // store GLsync id of the fence in event in order to associate them together
+  clglEvent->setData(clGLsync);
+  amd::Event* evt = dynamic_cast<amd::Event*>(clglEvent);
+  evt->retain();
+  return as_cl(evt);
+}
+RUNTIME_EXIT
+
+/*! @}
+ * \addtogroup clGetGLContextInfoKHR
+ * @{
+ */
+
+/*! \brief This function is defined by the cl_khr_gl_sharing extension and
+ * queries the current device and all devices that support CL-GL
+ * interoperability.
+ *
+ * \param properties points to an attribute list, which is an array of
+ * ordered attribute/value pairs terminated with zero. If an attribute is
+ * not specified in the list, then its default value (listed in table 4.attr)
+ * is used (it is said to be specified implicitly). If \a properties is NULL
+ * or empty (points to a list whose first value is zero), all attributes take
+ * on their default values.
+ *
+ * \param param_name may accept one of the following enumerated values:
+ * - CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
+ * - CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007.
+ *
+ * \param param_value is a pointer to memory where the appropriate result
+ * being queried is returned. If \a param_value is NULL, it is ignored.
+ *
+ * \param param_value_size is used to specify the size in bytes of memory
+ * pointed to by \a param_value. This size must be >= the size of the return
+ * type as described for the \a param_name argument.
+ * \a param_value_size_ret returns the actual size in bytes of data copied to
+ * \a param_value. If \a param_value_size_ret is NULL, it is ignored.
+ *
+ * \return One of the following values is returned:
+ * - CL_SUCCESS if the function is executed successfully.
+ * - CL_INVALID_VALUE if \a param_name is not valid, or if the size in bytes
+ *   specified by \a param_value_size is < the size of the return type and
+ *   \a param_value is not NULL.
+ * - CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR if no GL context or share group
+ *   is specified in \a properties.
+ * - CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources
+ *   required by the OpenCL implementation on the host.
+ *
+ * \version 1.0r47
+ */
+RUNTIME_ENTRY(cl_int, clGetGLContextInfoKHR,
+              (const cl_context_properties* properties, cl_gl_context_info param_name,
+               size_t param_value_size, void* param_value, size_t* param_value_size_ret)) {
+  cl_int errcode = 0;
+  cl_device_id* gpu_devices;
+  cl_uint num_gpu_devices = 0;
+  amd::Context::Info info;
+  static const bool VALIDATE_ONLY = true;
+
+  errcode = amd::Context::checkProperties(properties, &info);
+  if (CL_SUCCESS != errcode) {
+    return errcode;
+  }
+
+  if (!(info.flags_ & amd::Context::GLDeviceKhr)) {
+    // No GL context is specified
+    return CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR;
+  }
+
+  // Get devices
+  //errcode = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 0, NULL, &num_gpu_devices);
+  if (errcode != CL_SUCCESS && errcode != CL_DEVICE_NOT_FOUND) {
+    return CL_INVALID_VALUE;
+  }
+
+  if (!num_gpu_devices) {
+    return CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR;
+  }
+
+  switch (param_name) {
+    case CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR:
+      // Return the CL device currently associated with the specified OpenGL context.
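+      // Probe each GPU device with bindExternalDevice(VALIDATE_ONLY) and
+      // return the first one that can be bound to the given GL context.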
+ if (num_gpu_devices) { + gpu_devices = (cl_device_id*)alloca(num_gpu_devices * sizeof(cl_device_id)); + + //errcode = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, num_gpu_devices, gpu_devices, NULL); + if (errcode != CL_SUCCESS) { + return errcode; + } + + for (cl_uint i = 0; i < num_gpu_devices; ++i) { + cl_device_id device = gpu_devices[i]; + if (is_valid(device) && + as_amd(device)->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_, + VALIDATE_ONLY)) { + return amd::clGetInfo(device, param_value_size, param_value, param_value_size_ret); + } + } + + *not_null(param_value_size_ret) = 0; + } + break; + + case CL_DEVICES_FOR_GL_CONTEXT_KHR: { + // List of all CL devices that can be associated with the specified OpenGL context. + cl_uint total_devices = num_gpu_devices; + size_t size = total_devices * sizeof(cl_device_id); + + cl_device_id* devices = (cl_device_id*)alloca(size); + + //errcode = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, total_devices, devices, NULL); + if (errcode != CL_SUCCESS) { + return errcode; + } + + std::vector compatible_devices; + + for (cl_uint i = 0; i < total_devices; ++i) { + cl_device_id device = devices[i]; + if (is_valid(device) && + as_amd(device)->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_, + VALIDATE_ONLY)) { + compatible_devices.push_back(as_amd(device)); + } + } + + size_t deviceCount = compatible_devices.size(); + size_t deviceCountSize = deviceCount * sizeof(cl_device_id); + + if (param_value != NULL && param_value_size < deviceCountSize) { + return CL_INVALID_VALUE; + } + + *not_null(param_value_size_ret) = deviceCountSize; + + if (param_value != NULL) { + cl_device_id* deviceList = (cl_device_id*)param_value; + for (const auto& it : compatible_devices) { + *deviceList++ = as_cl(it); + } + } + + return CL_SUCCESS; + } break; + + default: + LogWarning("\"param_name\" is not valid"); + return CL_INVALID_VALUE; + } + return CL_SUCCESS; +} +RUNTIME_EXIT + +// +// +// namespace amd +// +// +namespace amd { + +typedef struct { + GLenum glBinding; + GLenum glTarget; +} TargetBindings_t; + +/*! @} + * \addtogroup CL-GL interop helper functions + * @{ + */ + +//! Function clearGLErrors() to clear all GL error bits, if any +void clearGLErrors(const Context& amdContext) { + GLenum glErr, glLastErr = GL_NO_ERROR; + while (1) { + glErr = amdContext.glenv()->glGetError_(); + if (glErr == GL_NO_ERROR || glErr == glLastErr) { + break; + } + glLastErr = glErr; + LogWarning("GL error"); + } +} + +GLenum checkForGLError(const Context& amdContext) { + GLenum glRetErr = GL_NO_ERROR; + GLenum glErr; + while (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) { + glRetErr = glErr; // Just return the last GL error + LogWarning("Check GL error"); + } + return glRetErr; +} + +//! Function getCLFormatFromGL returns "true" if GL format +//! is compatible with CL format, "false" otherwise. 
+bool getCLFormatFromGL(const Context& amdContext, GLint gliInternalFormat, + cl_image_format* pclImageFormat, int* piBytesPerPixel, cl_mem_flags flags) { + bool bRetVal = false; + + /* + Available values for "image_channel_order" + ========================================== + CL_R + CL_A + CL_INTENSITY + CL_LUMINANCE + CL_RG + CL_RA + CL_RGB + CL_RGBA + CL_ARGB + CL_BGRA + + Available values for "image_channel_data_type" + ============================================== + CL_SNORM_INT8 + CL_SNORM_INT16 + CL_UNORM_INT8 + CL_UNORM_INT16 + CL_UNORM_SHORT_565 + CL_UNORM_SHORT_555 + CL_UNORM_INT_101010 + CL_SIGNED_INT8 + CL_SIGNED_INT16 + CL_SIGNED_INT32 + CL_UNSIGNED_INT8 + CL_UNSIGNED_INT16 + CL_UNSIGNED_INT32 + CL_HALF_FLOAT + CL_FLOAT + */ + + switch (gliInternalFormat) { + case GL_RGB10_EXT: + pclImageFormat->image_channel_order = CL_RGBA; + pclImageFormat->image_channel_data_type = CL_UNORM_INT_101010; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_RGB10_A2: + pclImageFormat->image_channel_order = CL_RGB; + pclImageFormat->image_channel_data_type = CL_UNORM_INT_101010; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_BGR8_ATI: + case GL_BGRA8_ATI: + pclImageFormat->image_channel_order = CL_BGRA; + pclImageFormat->image_channel_data_type = CL_UNORM_INT8; // CL_UNSIGNED_INT8; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_ALPHA8: + pclImageFormat->image_channel_order = CL_A; + pclImageFormat->image_channel_data_type = CL_UNORM_INT8; // CL_UNSIGNED_INT8; + *piBytesPerPixel = 1; + bRetVal = true; + break; + + case GL_R8: + case GL_R8UI: + pclImageFormat->image_channel_order = CL_R; + pclImageFormat->image_channel_data_type = + (gliInternalFormat == GL_R8) ? CL_UNORM_INT8 : CL_UNSIGNED_INT8; + *piBytesPerPixel = 1; + bRetVal = true; + break; + + case GL_R8I: + pclImageFormat->image_channel_order = CL_R; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT8; + *piBytesPerPixel = 1; + bRetVal = true; + break; + + case GL_RG8: + case GL_RG8UI: + pclImageFormat->image_channel_order = CL_RG; + pclImageFormat->image_channel_data_type = + (gliInternalFormat == GL_RG8) ? CL_UNORM_INT8 : CL_UNSIGNED_INT8; + *piBytesPerPixel = 2; + bRetVal = true; + break; + + case GL_RG8I: + pclImageFormat->image_channel_order = CL_RG; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT8; + *piBytesPerPixel = 2; + bRetVal = true; + break; + + case GL_RGB8: + case GL_RGB8UI: + pclImageFormat->image_channel_order = CL_RGB; + pclImageFormat->image_channel_data_type = + (gliInternalFormat == GL_RGB8) ? CL_UNORM_INT8 : CL_UNSIGNED_INT8; + *piBytesPerPixel = 3; + bRetVal = true; + break; + + case GL_RGB8I: + pclImageFormat->image_channel_order = CL_RGB; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT8; + *piBytesPerPixel = 3; + bRetVal = true; + break; + + case GL_RGBA: + case GL_RGBA8: + case GL_RGBA8UI: + pclImageFormat->image_channel_order = CL_RGBA; + pclImageFormat->image_channel_data_type = + (gliInternalFormat == GL_RGBA8UI) ? CL_UNSIGNED_INT8 : CL_UNORM_INT8; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_RGBA8I: + pclImageFormat->image_channel_order = CL_RGBA; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT8; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_R16: + case GL_R16UI: + pclImageFormat->image_channel_order = CL_R; + pclImageFormat->image_channel_data_type = + (gliInternalFormat == GL_R16) ? 
CL_UNORM_INT16 : CL_UNSIGNED_INT16; + bRetVal = true; + *piBytesPerPixel = 2; + break; + + case GL_R16I: + pclImageFormat->image_channel_order = CL_R; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT16; + *piBytesPerPixel = 2; + bRetVal = true; + break; + + case GL_R16F: + pclImageFormat->image_channel_order = CL_R; + pclImageFormat->image_channel_data_type = CL_HALF_FLOAT; + *piBytesPerPixel = 2; + bRetVal = true; + break; + + case GL_RG16: + case GL_RG16UI: + pclImageFormat->image_channel_order = CL_RG; + pclImageFormat->image_channel_data_type = + (gliInternalFormat == GL_RG16) ? CL_UNORM_INT16 : CL_UNSIGNED_INT16; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_RG16I: + pclImageFormat->image_channel_order = CL_RG; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT16; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_RG16F: + pclImageFormat->image_channel_order = CL_RG; + pclImageFormat->image_channel_data_type = CL_HALF_FLOAT; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_RGB16: + case GL_RGB16UI: + pclImageFormat->image_channel_order = CL_RGB; + pclImageFormat->image_channel_data_type = + (gliInternalFormat == GL_RGB16) ? CL_UNORM_INT16 : CL_UNSIGNED_INT16; + *piBytesPerPixel = 6; + bRetVal = true; + break; + + case GL_RGB16I: + pclImageFormat->image_channel_order = CL_RGB; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT16; + *piBytesPerPixel = 6; + bRetVal = true; + break; + + case GL_RGB16F: + pclImageFormat->image_channel_order = CL_RGB; + pclImageFormat->image_channel_data_type = CL_HALF_FLOAT; + *piBytesPerPixel = 6; + bRetVal = true; + break; + + case GL_RGBA16: + case GL_RGBA16UI: + pclImageFormat->image_channel_order = CL_RGBA; + pclImageFormat->image_channel_data_type = + (gliInternalFormat == GL_RGBA16) ? 
CL_UNORM_INT16 : CL_UNSIGNED_INT16; + *piBytesPerPixel = 8; + bRetVal = true; + break; + + case GL_RGBA16I: + pclImageFormat->image_channel_order = CL_RGBA; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT16; + *piBytesPerPixel = 8; + bRetVal = true; + break; + + case GL_RGBA16F: + pclImageFormat->image_channel_order = CL_RGBA; + pclImageFormat->image_channel_data_type = CL_HALF_FLOAT; + *piBytesPerPixel = 8; + bRetVal = true; + break; + + case GL_R32I: + pclImageFormat->image_channel_order = CL_R; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT32; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_R32UI: + pclImageFormat->image_channel_order = CL_R; + pclImageFormat->image_channel_data_type = CL_UNSIGNED_INT32; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_R32F: + pclImageFormat->image_channel_order = CL_R; + pclImageFormat->image_channel_data_type = CL_FLOAT; + *piBytesPerPixel = 4; + bRetVal = true; + break; + + case GL_RG32I: + pclImageFormat->image_channel_order = CL_RG; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT32; + *piBytesPerPixel = 8; + bRetVal = true; + break; + + case GL_RG32UI: + pclImageFormat->image_channel_order = CL_RG; + pclImageFormat->image_channel_data_type = CL_UNSIGNED_INT32; + *piBytesPerPixel = 8; + bRetVal = true; + break; + + case GL_RG32F: + pclImageFormat->image_channel_order = CL_RG; + pclImageFormat->image_channel_data_type = CL_FLOAT; + *piBytesPerPixel = 8; + bRetVal = true; + break; + + case GL_RGB32I: + pclImageFormat->image_channel_order = CL_RGB; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT32; + *piBytesPerPixel = 12; + bRetVal = true; + break; + + case GL_RGB32UI: + pclImageFormat->image_channel_order = CL_RGB; + pclImageFormat->image_channel_data_type = CL_UNSIGNED_INT32; + *piBytesPerPixel = 12; + bRetVal = true; + break; + + case GL_RGB32F: + pclImageFormat->image_channel_order = CL_RGB; + pclImageFormat->image_channel_data_type = CL_FLOAT; + *piBytesPerPixel = 12; + bRetVal = true; + break; + + case GL_RGBA32I: + pclImageFormat->image_channel_order = CL_RGBA; + pclImageFormat->image_channel_data_type = CL_SIGNED_INT32; + *piBytesPerPixel = 16; + bRetVal = true; + break; + + case GL_RGBA32UI: + pclImageFormat->image_channel_order = CL_RGBA; + pclImageFormat->image_channel_data_type = CL_UNSIGNED_INT32; + *piBytesPerPixel = 16; + bRetVal = true; + break; + + case GL_RGBA32F: + pclImageFormat->image_channel_order = CL_RGBA; + pclImageFormat->image_channel_data_type = CL_FLOAT; + *piBytesPerPixel = 16; + bRetVal = true; + break; + case GL_DEPTH_COMPONENT32F: + pclImageFormat->image_channel_order = CL_DEPTH; + pclImageFormat->image_channel_data_type = CL_FLOAT; + *piBytesPerPixel = 4; + bRetVal = true; + break; + case GL_DEPTH_COMPONENT16: + pclImageFormat->image_channel_order = CL_DEPTH; + pclImageFormat->image_channel_data_type = CL_UNORM_INT16; + *piBytesPerPixel = 2; + bRetVal = true; + break; + case GL_DEPTH24_STENCIL8: + pclImageFormat->image_channel_order = CL_DEPTH_STENCIL; + pclImageFormat->image_channel_data_type = CL_UNORM_INT24; + *piBytesPerPixel = 4; + bRetVal = true; + break; + case GL_DEPTH32F_STENCIL8: + pclImageFormat->image_channel_order = CL_DEPTH_STENCIL; + pclImageFormat->image_channel_data_type = CL_FLOAT; + *piBytesPerPixel = 5; + bRetVal = true; + break; + default: + LogWarning("unsupported GL internal format"); + break; + } + amd::Image::Format imageFormat(*pclImageFormat); + if (bRetVal && !imageFormat.isSupported(amdContext, 0, flags)) { + bRetVal = 
false; + } + return bRetVal; +} + +void BufferGL::initDeviceMemory() { + deviceMemories_ = + reinterpret_cast(reinterpret_cast(this) + sizeof(BufferGL)); + memset(deviceMemories_, 0, context_().devices().size() * sizeof(DeviceMemory)); +} + +static GLenum clChannelDataTypeToGlType(cl_channel_type channel_type) { + // Pick + // GL_BYTE, GL_UNSIGNED_BYTE, GL_SHORT, GL_UNSIGNED_SHORT, GL_INT, + // GL_UNSIGNED_INT, GL_FLOAT, GL_2_BYTES, GL_3_BYTES, GL_4_BYTES + // or GL_DOUBLE + switch (channel_type) { + case CL_SNORM_INT8: + return GL_BYTE; + case CL_SNORM_INT16: + return GL_SHORT; + case CL_UNORM_INT8: + return GL_UNSIGNED_BYTE; + case CL_UNORM_INT16: + return GL_UNSIGNED_SHORT; + case CL_SIGNED_INT8: + return GL_BYTE; + case CL_SIGNED_INT16: + return GL_SHORT; + case CL_SIGNED_INT32: + return GL_INT; + case CL_UNSIGNED_INT8: + return GL_UNSIGNED_BYTE; + case CL_UNSIGNED_INT16: + return GL_UNSIGNED_SHORT; + case CL_UNSIGNED_INT32: + return GL_UNSIGNED_INT; + case CL_FLOAT: + return GL_FLOAT; + case CL_UNORM_INT_101010: + return GL_UNSIGNED_INT_10_10_10_2; + case CL_HALF_FLOAT: + case CL_UNORM_SHORT_565: + case CL_UNORM_SHORT_555: + default: + guarantee(false, "Unexpected CL type."); + return 0; + } +} + +static GLenum glInternalFormatToGlFormat(GLenum internalFormat) { + switch (internalFormat) { + // Base internal formats + case GL_RGBA: + case GL_BGRA: + return internalFormat; + // Sized internal formats + case GL_RGBA8: + case GL_RGBA16: + case GL_RGBA16F: + case GL_RGBA32F: + return GL_RGBA; + case GL_RGBA8I: + case GL_RGBA8UI: + case GL_RGBA16I: + case GL_RGBA16UI: + case GL_RGBA32I: + case GL_RGBA32UI: + return GL_RGBA_INTEGER; + + default: + guarantee(false, "Unexpected GL internal format."); + return 0; + } +} + +void ImageGL::initDeviceMemory() { + deviceMemories_ = + reinterpret_cast(reinterpret_cast(this) + sizeof(ImageGL)); + memset(deviceMemories_, 0, context_().devices().size() * sizeof(DeviceMemory)); +} + +//******************************************************************* +// +// Internal implementation of CL API functions +// +//******************************************************************* + +// +// clCreateFromGLBufferAMD +// +cl_mem clCreateFromGLBufferAMD(Context& amdContext, cl_mem_flags flags, GLuint bufobj, + cl_int* errcode_ret) { + BufferGL* pBufferGL = NULL; + GLenum glErr; + GLenum glTarget = GL_ARRAY_BUFFER; + GLint gliSize = 0; + GLint gliMapped = 0; + + // Verify context init'ed for interop + if (!amdContext.glenv() || !amdContext.glenv()->isAssociated()) { + *not_null(errcode_ret) = CL_INVALID_CONTEXT; + LogWarning("\"amdContext\" is not created from GL context or share list"); + return (cl_mem)0; + } + + // Add this scope to bound the scoped lock + { + GLFunctions::SetIntEnv ie(amdContext.glenv()); + if (!ie.isValid()) { + *not_null(errcode_ret) = CL_INVALID_CONTEXT; + LogWarning("\"amdContext\" is not created from GL context or share list"); + return as_cl(0); + } + + // Verify GL buffer object + clearGLErrors(amdContext); + if ((GL_FALSE == amdContext.glenv()->glIsBuffer_(bufobj)) || + (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_()))) { + *not_null(errcode_ret) = CL_INVALID_GL_OBJECT; + LogWarning("\"bufobj\" is not a GL buffer object"); + return (cl_mem)0; + } + + // It seems that CL spec is not concerned with GL_BUFFER_USAGE, so skip it + + // Check if size is available - data store is created + + amdContext.glenv()->glBindBuffer_(glTarget, bufobj); + clearGLErrors(amdContext); + amdContext.glenv()->glGetBufferParameteriv_(glTarget, 
GL_BUFFER_SIZE, &gliSize); + if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) { + *not_null(errcode_ret) = CL_INVALID_GL_OBJECT; + LogWarning("cannot get the GL buffer size"); + return (cl_mem)0; + } + if (gliSize == 0) { + //@todo - check why sometime the size is zero + *not_null(errcode_ret) = CL_INVALID_GL_OBJECT; + LogWarning("the GL buffer's data store is not created"); + return (cl_mem)0; + } + + // Mapping will be done at acquire time (sync point) + + } // Release scoped lock + + // Now create BufferGL object + pBufferGL = new (amdContext) BufferGL(amdContext, flags, gliSize, 0, bufobj); + + if (!pBufferGL) { + *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY; + LogWarning("cannot create object of class BufferGL"); + return (cl_mem)0; + } + + if (!pBufferGL->create()) { + *not_null(errcode_ret) = CL_MEM_OBJECT_ALLOCATION_FAILURE; + pBufferGL->release(); + return (cl_mem)0; + } + + *not_null(errcode_ret) = CL_SUCCESS; + + // Create interop object + if (pBufferGL->getInteropObj() == NULL) { + *not_null(errcode_ret) = CL_INVALID_GL_OBJECT; + LogWarning("cannot create object of class BufferGL"); + return (cl_mem)0; + } + + // Fixme: If more than one device is present in the context, we choose the first device. + // We should come up with a more elegant solution to handle this. + assert(amdContext.devices().size() == 1); + + const auto it = amdContext.devices().cbegin(); + const amd::Device& dev = *(*it); + + device::Memory* mem = pBufferGL->getDeviceMemory(dev); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", pBufferGL->getSize()); + *not_null(errcode_ret) = CL_INVALID_GL_OBJECT; + return (cl_mem)0; + } + mem->processGLResource(device::Memory::GLDecompressResource); + + return as_cl(pBufferGL); +} + +cl_mem clCreateFromGLTextureAMD(Context& amdContext, cl_mem_flags clFlags, GLenum target, + GLint miplevel, GLuint texture, int* errcode_ret) { + ImageGL* pImageGL = NULL; + GLenum glErr; + GLenum glTarget = 0; + GLenum glInternalFormat; + cl_image_format clImageFormat; + uint dim = 1; + cl_mem_object_type clType; + cl_gl_object_type clGLType; + GLsizei numSamples = 1; + + // Verify context init'ed for interop + if (!amdContext.glenv() || !amdContext.glenv()->isAssociated()) { + *not_null(errcode_ret) = CL_INVALID_CONTEXT; + LogWarning("\"amdContext\" is not created from GL context or share list"); + return static_cast(0); + } + + GLint gliTexWidth = 1; + GLint gliTexHeight = 1; + GLint gliTexDepth = 1; + + // Add this scope to bound the scoped lock + { + GLFunctions::SetIntEnv ie(amdContext.glenv()); + if (!ie.isValid()) { + *not_null(errcode_ret) = CL_INVALID_CONTEXT; + LogWarning("\"amdContext\" is not created from GL context or share list"); + return as_cl(0); + } + + // Verify GL texture object + clearGLErrors(amdContext); + if ((GL_FALSE == amdContext.glenv()->glIsTexture_(texture)) || + (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_()))) { + *not_null(errcode_ret) = CL_INVALID_GL_OBJECT; + LogWarning("\"texture\" is not a GL texture object"); + return static_cast(0); + } + + bool image = true; + + // Check target value validity + switch (target) { + case GL_TEXTURE_BUFFER: + glTarget = GL_TEXTURE_BUFFER; + dim = 1; + clType = CL_MEM_OBJECT_IMAGE1D_BUFFER; + clGLType = CL_GL_OBJECT_TEXTURE_BUFFER; + image = false; + break; + + case GL_TEXTURE_1D: + glTarget = GL_TEXTURE_1D; + dim = 1; + clType = CL_MEM_OBJECT_IMAGE1D; + clGLType = CL_GL_OBJECT_TEXTURE1D; + break; + + case GL_TEXTURE_CUBE_MAP_POSITIVE_X: + case 
GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+        glTarget = GL_TEXTURE_CUBE_MAP;
+        dim = 2;
+        clType = CL_MEM_OBJECT_IMAGE2D;
+        clGLType = CL_GL_OBJECT_TEXTURE2D;
+        break;
+
+      case GL_TEXTURE_1D_ARRAY:
+        glTarget = GL_TEXTURE_1D_ARRAY;
+        dim = 2;
+        clType = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+        clGLType = CL_GL_OBJECT_TEXTURE1D_ARRAY;
+        break;
+
+      case GL_TEXTURE_2D:
+        glTarget = GL_TEXTURE_2D;
+        dim = 2;
+        clType = CL_MEM_OBJECT_IMAGE2D;
+        clGLType = CL_GL_OBJECT_TEXTURE2D;
+        break;
+
+      case GL_TEXTURE_2D_MULTISAMPLE:
+        glTarget = GL_TEXTURE_2D_MULTISAMPLE;
+        dim = 2;
+        clType = CL_MEM_OBJECT_IMAGE2D;
+        clGLType = CL_GL_OBJECT_TEXTURE2D;
+        break;
+
+      case GL_TEXTURE_RECTANGLE_ARB:
+        glTarget = GL_TEXTURE_RECTANGLE_ARB;
+        dim = 2;
+        clType = CL_MEM_OBJECT_IMAGE2D;
+        clGLType = CL_GL_OBJECT_TEXTURE2D;
+        break;
+
+      case GL_TEXTURE_2D_ARRAY:
+        glTarget = GL_TEXTURE_2D_ARRAY;
+        dim = 3;
+        clType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+        clGLType = CL_GL_OBJECT_TEXTURE2D_ARRAY;
+        break;
+
+      case GL_TEXTURE_3D:
+        glTarget = GL_TEXTURE_3D;
+        dim = 3;
+        clType = CL_MEM_OBJECT_IMAGE3D;
+        clGLType = CL_GL_OBJECT_TEXTURE3D;
+        break;
+
+      default:
+        // wrong value
+        *not_null(errcode_ret) = CL_INVALID_VALUE;
+        LogWarning("invalid \"target\" value");
+        return static_cast<cl_mem>(0);
+        break;
+    }
+
+    amdContext.glenv()->glBindTexture_(glTarget, texture);
+
+    // Check if size is available - data store is created
+    if (image) {
+      // Check mipmap level for "texture" name
+      GLint gliTexBaseLevel;
+      GLint gliTexMaxLevel;
+
+      clearGLErrors(amdContext);
+      amdContext.glenv()->glGetTexParameteriv_(glTarget, GL_TEXTURE_BASE_LEVEL, &gliTexBaseLevel);
+      if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+        *not_null(errcode_ret) = CL_INVALID_MIP_LEVEL;
+        LogWarning("Cannot get base mipmap level of a GL \"texture\" object");
+        return static_cast<cl_mem>(0);
+      }
+      clearGLErrors(amdContext);
+      amdContext.glenv()->glGetTexParameteriv_(glTarget, GL_TEXTURE_MAX_LEVEL, &gliTexMaxLevel);
+      if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+        *not_null(errcode_ret) = CL_INVALID_MIP_LEVEL;
+        LogWarning("Cannot get max mipmap level of a GL \"texture\" object");
+        return static_cast<cl_mem>(0);
+      }
+      if ((gliTexBaseLevel > miplevel) || (miplevel > gliTexMaxLevel)) {
+        *not_null(errcode_ret) = CL_INVALID_MIP_LEVEL;
+        LogWarning("\"miplevel\" is not a valid mipmap level of the GL \"texture\" object");
+        return static_cast<cl_mem>(0);
+      }
+
+      // Get GL texture format and check if it's compatible with CL format
+      clearGLErrors(amdContext);
+      amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_INTERNAL_FORMAT,
+                                                    (GLint*)&glInternalFormat);
+      if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+        *not_null(errcode_ret) = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        LogWarning("Cannot get internal format of \"miplevel\" of GL \"texture\" object");
+        return static_cast<cl_mem>(0);
+      }
+
+      amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_SAMPLES,
+                                                    (GLint*)&numSamples);
+      if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+        *not_null(errcode_ret) = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        LogWarning("Cannot get number of samples of GL \"texture\" object");
+        return static_cast<cl_mem>(0);
+      }
+      if (numSamples > 1) {
+        *not_null(errcode_ret) = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        LogWarning("MSAA \"texture\" object is not supported for the device");
+        return static_cast<cl_mem>(0);
+      }
+
+      // Now get CL format from GL format and bytes per pixel
+      int iBytesPerPixel = 0;
+      if (!getCLFormatFromGL(amdContext, glInternalFormat, &clImageFormat, &iBytesPerPixel,
+                             clFlags)) {
+        *not_null(errcode_ret) = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        LogWarning("\"texture\" format does not map to an appropriate CL image format");
+        return static_cast<cl_mem>(0);
+      }
+
+      switch (dim) {
+        case 3:
+          clearGLErrors(amdContext);
+          amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_DEPTH,
+                                                        &gliTexDepth);
+          if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+            *not_null(errcode_ret) = CL_INVALID_GL_OBJECT;
+            LogWarning("Cannot get the depth of \"miplevel\" of GL \"texture\"");
+            return static_cast<cl_mem>(0);
+          }
+          // Fall through to process other dimensions...
+        case 2:
+          clearGLErrors(amdContext);
+          amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_HEIGHT,
+                                                        &gliTexHeight);
+          if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+            *not_null(errcode_ret) = CL_INVALID_GL_OBJECT;
+            LogWarning("Cannot get the height of \"miplevel\" of GL \"texture\"");
+            return static_cast<cl_mem>(0);
+          }
+          // Fall through to process other dimensions...
+        case 1:
+          clearGLErrors(amdContext);
+          amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_WIDTH,
+                                                        &gliTexWidth);
+          if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+            *not_null(errcode_ret) = CL_INVALID_GL_OBJECT;
+            LogWarning("Cannot get the width of \"miplevel\" of GL \"texture\"");
+            return static_cast<cl_mem>(0);
+          }
+          break;
+        default:
+          *not_null(errcode_ret) = CL_INVALID_VALUE;
+          LogWarning("invalid \"target\" value");
+          return static_cast<cl_mem>(0);
+      }
+    } else {
+      GLint size;
+
+      // In case target is GL_TEXTURE_BUFFER
+      GLint backingBuffer;
+      clearGLErrors(amdContext);
+      amdContext.glenv()->glGetTexLevelParameteriv_(
+          glTarget, 0, GL_TEXTURE_BUFFER_DATA_STORE_BINDING, &backingBuffer);
+      if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+        *not_null(errcode_ret) = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        LogWarning("Cannot get backing buffer for GL \"texture buffer\" object");
+        return static_cast<cl_mem>(0);
+      }
+      amdContext.glenv()->glBindBuffer_(glTarget, backingBuffer);
+
+      // Get GL texture format and check if it's compatible with CL format
+      clearGLErrors(amdContext);
+      amdContext.glenv()->glGetIntegerv_(GL_TEXTURE_BUFFER_FORMAT_EXT,
+                                         reinterpret_cast<GLint*>(&glInternalFormat));
+      if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+        *not_null(errcode_ret) = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        LogWarning("Cannot get internal format of \"miplevel\" of GL \"texture\" object");
+        return static_cast<cl_mem>(0);
+      }
+
+      // Now get CL format from GL format and bytes per pixel
+      int iBytesPerPixel = 0;
+      if (!getCLFormatFromGL(amdContext, glInternalFormat, &clImageFormat, &iBytesPerPixel,
+                             clFlags)) {
+        *not_null(errcode_ret) = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        LogWarning("\"texture\" format does not map to an appropriate CL image format");
+        return static_cast<cl_mem>(0);
+      }
+
+      clearGLErrors(amdContext);
+      amdContext.glenv()->glGetBufferParameteriv_(glTarget, GL_BUFFER_SIZE, &size);
+      if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+        *not_null(errcode_ret) = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        LogWarning("Cannot get the size of the GL \"texture buffer\" object");
+        return static_cast<cl_mem>(0);
+      }
+
+      gliTexWidth = size / iBytesPerPixel;
+    }
+    size_t imageSize = (clType == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+        ? static_cast<size_t>(gliTexHeight)
+        : static_cast<size_t>(gliTexDepth);
+
+    if (!amd::Image::validateDimensions(
+            amdContext.devices(), clType, static_cast<size_t>(gliTexWidth),
+            static_cast<size_t>(gliTexHeight), static_cast<size_t>(gliTexDepth), imageSize)) {
+      *not_null(errcode_ret) = CL_INVALID_GL_OBJECT;
+      LogWarning("The GL \"texture\" data store is not created or out of supported dimensions");
+      return static_cast<cl_mem>(0);
+    }
+
+    // PBO and mapping will be done at "acquire" time (sync point)
+
+  }  // Release scoped lock
+
+  target = (glTarget == GL_TEXTURE_CUBE_MAP) ? target : 0;
+
+  pImageGL = new (amdContext)
+      ImageGL(amdContext, clType, clFlags, clImageFormat, static_cast<size_t>(gliTexWidth),
+              static_cast<size_t>(gliTexHeight), static_cast<size_t>(gliTexDepth), glTarget,
+              texture, miplevel, glInternalFormat, clGLType, numSamples, target);
+
+  if (!pImageGL) {
+    *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
+    LogWarning("Cannot create class ImageGL - out of memory?");
+    return static_cast<cl_mem>(0);
+  }
+
+  if (!pImageGL->create()) {
+    *not_null(errcode_ret) = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    pImageGL->release();
+    return static_cast<cl_mem>(0);
+  }
+
+  *not_null(errcode_ret) = CL_SUCCESS;
+  return as_cl(pImageGL);
+}
+
+//
+// clCreateFromGLRenderbufferAMD
+//
+cl_mem clCreateFromGLRenderbufferAMD(Context& amdContext, cl_mem_flags clFlags, GLuint renderbuffer,
+                                     int* errcode_ret) {
+  ImageGL* pImageGL = NULL;
+  GLenum glErr;
+
+  GLenum glTarget = GL_RENDERBUFFER;
+  GLenum glInternalFormat;
+  cl_image_format clImageFormat;
+
+  // Verify context init'ed for interop
+  if (!amdContext.glenv() || !amdContext.glenv()->isAssociated()) {
+    *not_null(errcode_ret) = CL_INVALID_CONTEXT;
+    LogWarning("\"amdContext\" is not created from GL context or share list");
+    return (cl_mem)0;
+  }
+
+  GLint gliRbWidth;
+  GLint gliRbHeight;
+
+  // Add this scope to bound the scoped lock
+  {
+    GLFunctions::SetIntEnv ie(amdContext.glenv());
+    if (!ie.isValid()) {
+      *not_null(errcode_ret) = CL_INVALID_CONTEXT;
+      LogWarning("\"amdContext\" is not created from GL context or share list");
+      return as_cl(0);
+    }
+
+    // Verify GL renderbuffer object
+    clearGLErrors(amdContext);
+    if ((GL_FALSE == amdContext.glenv()->glIsRenderbufferEXT_(renderbuffer)) ||
+        (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_()))) {
+      *not_null(errcode_ret) = CL_INVALID_GL_OBJECT;
+      LogWarning("\"renderbuffer\" is not a GL renderbuffer object");
+      return (cl_mem)0;
+    }
+
+    amdContext.glenv()->glBindRenderbuffer_(glTarget, renderbuffer);
+
+    // Get GL RB format and check if it's compatible with CL format
+    clearGLErrors(amdContext);
+    amdContext.glenv()->glGetRenderbufferParameterivEXT_(glTarget, GL_RENDERBUFFER_INTERNAL_FORMAT,
+                                                         (GLint*)&glInternalFormat);
+    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+      *not_null(errcode_ret) = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+      LogWarning("Cannot get internal format of GL \"renderbuffer\" object");
+      return (cl_mem)0;
+    }
+
+    // Now get CL format from GL format and bytes per pixel
+    int iBytesPerPixel = 0;
+    if (!getCLFormatFromGL(amdContext, glInternalFormat, &clImageFormat, &iBytesPerPixel,
+                           clFlags)) {
+      *not_null(errcode_ret) = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+      LogWarning("\"renderbuffer\" format does not map to an appropriate CL image format");
+      return (cl_mem)0;
+    }
+
+    // Check if size is available - data store is created
+    clearGLErrors(amdContext);
+    amdContext.glenv()->glGetRenderbufferParameterivEXT_(glTarget, GL_RENDERBUFFER_WIDTH,
+                                                         &gliRbWidth);
+    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+
*not_null(errcode_ret) = CL_INVALID_GL_OBJECT; + LogWarning("Cannot get the width of GL \"renderbuffer\""); + return (cl_mem)0; + } + if (gliRbWidth == 0) { + *not_null(errcode_ret) = CL_INVALID_GL_OBJECT; + LogWarning("The GL \"renderbuffer\" data store is not created"); + return (cl_mem)0; + } + clearGLErrors(amdContext); + amdContext.glenv()->glGetRenderbufferParameterivEXT_(glTarget, GL_RENDERBUFFER_HEIGHT, + &gliRbHeight); + if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) { + *not_null(errcode_ret) = CL_INVALID_GL_OBJECT; + LogWarning("Cannot get the height of GL \"renderbuffer\""); + return (cl_mem)0; + } + if (gliRbHeight == 0) { + *not_null(errcode_ret) = CL_INVALID_GL_OBJECT; + LogWarning("The GL \"renderbuffer\" data store is not created"); + return (cl_mem)0; + } + + // PBO and mapping will be done at "acquire" time (sync point) + + } // Release scoped lock + + pImageGL = + new (amdContext) ImageGL(amdContext, CL_MEM_OBJECT_IMAGE2D, clFlags, clImageFormat, + (size_t)gliRbWidth, (size_t)gliRbHeight, 1, glTarget, renderbuffer, + 0, glInternalFormat, CL_GL_OBJECT_RENDERBUFFER, 0); + + if (!pImageGL) { + *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY; + LogWarning("Cannot create class ImageGL from renderbuffer - out of memory?"); + return (cl_mem)0; + } + + if (!pImageGL->create()) { + *not_null(errcode_ret) = CL_MEM_OBJECT_ALLOCATION_FAILURE; + pImageGL->release(); + return (cl_mem)0; + } + + *not_null(errcode_ret) = CL_SUCCESS; + return as_cl(pImageGL); +} + +// +// clEnqueueAcquireExtObjectsAMD +// + +static cl_int clSetInteropObjects(cl_uint num_objects, const cl_mem* mem_objects, + std::vector& interopObjects) { + if ((num_objects == 0 && mem_objects != NULL) || (num_objects != 0 && mem_objects == NULL)) { + return CL_INVALID_VALUE; + } + + while (num_objects-- > 0) { + cl_mem obj = *mem_objects++; + if (!is_valid(obj)) { + return CL_INVALID_MEM_OBJECT; + } + + amd::Memory* mem = as_amd(obj); + if (mem->getInteropObj() == NULL) { + return CL_INVALID_GL_OBJECT; + } + + interopObjects.push_back(mem); + } + return CL_SUCCESS; +} + +cl_int clEnqueueAcquireExtObjectsAMD(cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event, + cl_command_type cmd_type) { + if (!is_valid(command_queue)) { + return CL_INVALID_COMMAND_QUEUE; + } + + amd::HostQueue* queue = as_amd(command_queue)->asHostQueue(); + if (NULL == queue) { + return CL_INVALID_COMMAND_QUEUE; + } + amd::HostQueue& hostQueue = *queue; + + if (cmd_type == CL_COMMAND_ACQUIRE_GL_OBJECTS) { + // Verify context init'ed for interop + if (!hostQueue.context().glenv() || !hostQueue.context().glenv()->isAssociated()) { + LogWarning("\"amdContext\" is not created from GL context or share list"); + return CL_INVALID_CONTEXT; + } + } + + std::vector memObjects; + cl_int err = clSetInteropObjects(num_objects, mem_objects, memObjects); + if (err != CL_SUCCESS) { + return err; + } + + amd::Command::EventWaitList eventWaitList; + err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, + event_wait_list); + if (err != CL_SUCCESS) { + return err; + } + +#ifdef _WIN32 + if ((hostQueue.context().info().flags_ & amd::Context::InteropUserSync) == 0) { + //! Make sure D3D10 queues are flushed and all commands are finished + //! before CL side would access interop objects + if (cmd_type == CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR) { + SyncD3D10Objects(memObjects); + } + //! 
Make sure D3D11 queues are flushed and all commands are finished + //! before CL side would access interop objects + if (cmd_type == CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR) { + SyncD3D11Objects(memObjects); + } + //! Make sure D3D9 queues are flushed and all commands are finished + //! before CL side would access interop objects + if (cmd_type == CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR) { + SyncD3D9Objects(memObjects); + } + } +#endif //_WIN32 + + //! Now create command and enqueue + amd::AcquireExtObjectsCommand* command = new amd::AcquireExtObjectsCommand( + hostQueue, eventWaitList, num_objects, memObjects, cmd_type); + if (command == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + + // Make sure we have memory for the command execution + if (!command->validateMemory()) { + delete command; + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } + + command->enqueue(); + + *not_null(event) = as_cl(&command->event()); + if (event == NULL) { + command->release(); + } + return CL_SUCCESS; +} + + +// +// clEnqueueReleaseExtObjectsAMD +// +cl_int clEnqueueReleaseExtObjectsAMD(cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event, + cl_command_type cmd_type) { + if (!is_valid(command_queue)) { + return CL_INVALID_COMMAND_QUEUE; + } + + amd::HostQueue* queue = as_amd(command_queue)->asHostQueue(); + if (NULL == queue) { + return CL_INVALID_COMMAND_QUEUE; + } + amd::HostQueue& hostQueue = *queue; + + std::vector memObjects; + cl_int err = clSetInteropObjects(num_objects, mem_objects, memObjects); + if (err != CL_SUCCESS) { + return err; + } + + amd::Command::EventWaitList eventWaitList; + err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, + event_wait_list); + if (err != CL_SUCCESS) { + return err; + } + + //! Now create command and enqueue + amd::ReleaseExtObjectsCommand* command = new amd::ReleaseExtObjectsCommand( + hostQueue, eventWaitList, num_objects, memObjects, cmd_type); + if (command == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + + // Make sure we have memory for the command execution + if (!command->validateMemory()) { + delete command; + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } + + command->enqueue(); + +#ifdef _WIN32 + if ((hostQueue.context().info().flags_ & amd::Context::InteropUserSync) == 0) { + //! Make sure CL command queue is flushed and all commands are finished + //! 
before D3D10 side would access interop resources + if (cmd_type == CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR || + cmd_type == CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR || + cmd_type == CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR) { + command->awaitCompletion(); + } + } +#endif //_WIN32 + + *not_null(event) = as_cl(&command->event()); + + if (event == NULL) { + command->release(); + } + + return CL_SUCCESS; +} + +// Placed here as opposed to command.cpp, as glext.h and cl_gl_amd.hpp will have +// to be included because of the GL calls +bool ClGlEvent::waitForFence() { + GLenum ret; + // get fence id associated with fence event + GLsync gs = reinterpret_cast(command().data()); + if (!gs) return false; + +// Try to use DC and GLRC of current thread, if it doesn't exist +// create a new GL context on this thread, which is shared with the original context + +#ifdef _WIN32 + HDC tempDC_ = wglGetCurrentDC(); + HGLRC tempGLRC_ = wglGetCurrentContext(); + // Set DC and GLRC + if (tempDC_ && tempGLRC_) { + ret = context().glenv()->glClientWaitSync_(gs, GL_SYNC_FLUSH_COMMANDS_BIT, + static_cast(-1)); + if (!(ret == GL_ALREADY_SIGNALED || ret == GL_CONDITION_SATISFIED)) return false; + } else { + tempDC_ = context().glenv()->getDC(); + tempGLRC_ = context().glenv()->getIntGLRC(); + if (!context().glenv()->init(reinterpret_cast(tempDC_), + reinterpret_cast(tempGLRC_))) + return false; + + // Make the newly created GL context current to this thread + context().glenv()->setIntEnv(); + // If fence has not yet executed, wait till it finishes + ret = context().glenv()->glClientWaitSync_(gs, GL_SYNC_FLUSH_COMMANDS_BIT, + static_cast(-1)); + if (!(ret == GL_ALREADY_SIGNALED || ret == GL_CONDITION_SATISFIED)) return false; + // Since we're done making GL calls, restore whatever context was previously current to this + // thread + context().glenv()->restoreEnv(); + } +#else // Lnx + Display* tempDpy_ = context().glenv()->glXGetCurrentDisplay_(); + GLXDrawable tempDrawable_ = context().glenv()->glXGetCurrentDrawable_(); + GLXContext tempCtx_ = context().glenv()->glXGetCurrentContext_(); + // Set internal Display and GLXContext + if (tempDpy_ && tempCtx_) { + ret = context().glenv()->glClientWaitSync_(gs, GL_SYNC_FLUSH_COMMANDS_BIT, + static_cast(-1)); + if (!(ret == GL_ALREADY_SIGNALED || ret == GL_CONDITION_SATISFIED)) return false; + } else { + if (!context().glenv()->init(reinterpret_cast(context().glenv()->getIntDpy()), + reinterpret_cast(context().glenv()->getIntCtx()))) + return false; + + // Make the newly created GL context current to this thread + context().glenv()->setIntEnv(); + // If fence has not yet executed, wait till it finishes + ret = context().glenv()->glClientWaitSync_(gs, GL_SYNC_FLUSH_COMMANDS_BIT, + static_cast(-1)); + if (!(ret == GL_ALREADY_SIGNALED || ret == GL_CONDITION_SATISFIED)) return false; + // Since we're done making GL calls, restore whatever context was previously current to this + // thread + context().glenv()->restoreEnv(); + } +#endif + // If we reach this point, fence should have completed + setStatus(CL_COMPLETE); + return true; +} + +// +// GLFunctions implementation +// + +#ifdef _WIN32 +#define CONVERT_CHAR_GLUBYTE +#else //!_WIN32 +#define CONVERT_CHAR_GLUBYTE (GLubyte*) +#endif //!_WIN32 + +#define GLPREFIX(rtype, fcn, dclargs) \ + if (!(fcn##_ = (PFN_##fcn)GETPROCADDRESS(libHandle_, #fcn))) { \ + if (!(fcn##_ = (PFN_##fcn)GetProcAddress_(reinterpret_cast(#fcn)))) ++missed_; \ + } + +GLFunctions::SetIntEnv::SetIntEnv(GLFunctions* env) : env_(env) { + env_->getLock().lock(); + + 
// Set environment (DC and GLRC) + isValid_ = env_->setIntEnv(); +} + +GLFunctions::SetIntEnv::~SetIntEnv() { + // Restore environment (CL DC and CL GLRC) + env_->restoreEnv(); + + env_->getLock().unlock(); +} + +GLFunctions::GLFunctions(HMODULE h, bool isEGL) + : libHandle_(h), + missed_(0), + eglDisplay_(EGL_NO_DISPLAY), + eglOriginalContext_(EGL_NO_CONTEXT), + eglInternalContext_(EGL_NO_CONTEXT), + eglTempContext_(EGL_NO_CONTEXT), + isEGL_(isEGL), +#ifdef _WIN32 + hOrigGLRC_(0), + hDC_(0), + hIntGLRC_(0) +#else //!_WIN32 + Dpy_(0), + Drawable_(0), + origCtx_(0), + intDpy_(0), + intDrawable_(0), + intCtx_(0), + XOpenDisplay_(NULL), + XCloseDisplay_(NULL), + glXGetCurrentDrawable_(NULL), + glXGetCurrentDisplay_(NULL), + glXGetCurrentContext_(NULL), + glXChooseVisual_(NULL), + glXCreateContext_(NULL), + glXDestroyContext_(NULL), + glXMakeCurrent_(NULL) +#endif //!_WIN32 +{ +#define VERIFY_POINTER(p) \ + if (NULL == p) { \ + missed_++; \ + } + + if (isEGL_) { + GetProcAddress_ = (PFN_xxxGetProcAddress)GETPROCADDRESS(h, "eglGetProcAddress"); + } else { + GetProcAddress_ = (PFN_xxxGetProcAddress)GETPROCADDRESS(h, API_GETPROCADDR); + } +#ifndef _WIN32 + // Initialize pointers to X11/GLX functions + // We can not link with these functions on compile time since we need to support + // console mode. In console mode X server and X server components may be absent. + // Hence linking with X11 or libGL will fail module image loading in console mode.-tzachi cohen + + if (!isEGL_) { + glXGetCurrentDrawable_ = (PFNglXGetCurrentDrawable)GETPROCADDRESS(h, "glXGetCurrentDrawable"); + VERIFY_POINTER(glXGetCurrentDrawable_) + glXGetCurrentDisplay_ = (PFNglXGetCurrentDisplay)GETPROCADDRESS(h, "glXGetCurrentDisplay"); + VERIFY_POINTER(glXGetCurrentDisplay_) + glXGetCurrentContext_ = (PFNglXGetCurrentContext)GETPROCADDRESS(h, "glXGetCurrentContext"); + VERIFY_POINTER(glXGetCurrentContext_) + glXChooseVisual_ = (PFNglXChooseVisual)GETPROCADDRESS(h, "glXChooseVisual"); + VERIFY_POINTER(glXChooseVisual_) + glXCreateContext_ = (PFNglXCreateContext)GETPROCADDRESS(h, "glXCreateContext"); + VERIFY_POINTER(glXCreateContext_) + glXDestroyContext_ = (PFNglXDestroyContext)GETPROCADDRESS(h, "glXDestroyContext"); + VERIFY_POINTER(glXDestroyContext_) + glXMakeCurrent_ = (PFNglXMakeCurrent)GETPROCADDRESS(h, "glXMakeCurrent"); + VERIFY_POINTER(glXMakeCurrent_) + + HMODULE hXModule = (HMODULE)Os::loadLibrary("libX11.so.6"); + if (NULL != hXModule) { + XOpenDisplay_ = (PFNXOpenDisplay)GETPROCADDRESS(hXModule, "XOpenDisplay"); + VERIFY_POINTER(XOpenDisplay_) + XCloseDisplay_ = (PFNXCloseDisplay)GETPROCADDRESS(hXModule, "XCloseDisplay"); + VERIFY_POINTER(XCloseDisplay_) + } else { + missed_ += 2; + } + } +// Initialize pointers to GL functions +#include "gl_functions.hpp" +#else + if (!isEGL_) { + wglCreateContext_ = (PFN_wglCreateContext)GETPROCADDRESS(h, "wglCreateContext"); + VERIFY_POINTER(wglCreateContext_) + wglGetCurrentContext_ = (PFN_wglGetCurrentContext)GETPROCADDRESS(h, "wglGetCurrentContext"); + VERIFY_POINTER(wglGetCurrentContext_) + wglGetCurrentDC_ = (PFN_wglGetCurrentDC)GETPROCADDRESS(h, "wglGetCurrentDC"); + VERIFY_POINTER(wglGetCurrentDC_) + wglDeleteContext_ = (PFN_wglDeleteContext)GETPROCADDRESS(h, "wglDeleteContext"); + VERIFY_POINTER(wglDeleteContext_) + wglMakeCurrent_ = (PFN_wglMakeCurrent)GETPROCADDRESS(h, "wglMakeCurrent"); + VERIFY_POINTER(wglMakeCurrent_) + wglShareLists_ = (PFN_wglShareLists)GETPROCADDRESS(h, "wglShareLists"); + VERIFY_POINTER(wglShareLists_) + } +#endif +} + 
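+// Tear down the internal GL context/display created by init(); the original
+// application context is left untouched.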
+GLFunctions::~GLFunctions() {
+#ifdef _WIN32
+ if (hIntGLRC_) {
+ if (!wglDeleteContext_(hIntGLRC_)) {
+ DWORD dwErr = GetLastError();
+ LogWarning("Cannot delete GLRC");
+ }
+ }
+#else //!_WIN32
+ if (intDpy_) {
+ if (intCtx_) {
+ glXDestroyContext_(intDpy_, intCtx_);
+ intCtx_ = NULL;
+ }
+ XCloseDisplay_(intDpy_);
+ intDpy_ = NULL;
+ }
+#endif //!_WIN32
+}
+
+bool GLFunctions::init(intptr_t hdc, intptr_t hglrc) {
+ if (isEGL_) {
+ eglDisplay_ = (EGLDisplay)hdc;
+ eglOriginalContext_ = (EGLContext)hglrc;
+ return true;
+ }
+
+#ifdef _WIN32
+ DWORD err;
+
+ if (missed_) {
+ return false;
+ }
+
+ if (!hdc) {
+ hDC_ = wglGetCurrentDC_();
+ } else {
+ hDC_ = (HDC)hdc;
+ }
+ hOrigGLRC_ = (HGLRC)hglrc;
+ if (!(hIntGLRC_ = wglCreateContext_(hDC_))) {
+ err = GetLastError();
+ return false;
+ }
+ if (!wglShareLists_(hOrigGLRC_, hIntGLRC_)) {
+ err = GetLastError();
+ return false;
+ }
+
+ bool makeCurrentNull = false;
+
+ if (wglGetCurrentContext_() == NULL) {
+ wglMakeCurrent_(hDC_, hIntGLRC_);
+
+ makeCurrentNull = true;
+ }
+
+// Initialize pointers to GL functions
+#include "gl_functions.hpp"
+
+ if (makeCurrentNull) {
+ wglMakeCurrent_(NULL, NULL);
+ }
+
+ if (missed_ == 0) {
+ return true;
+ }
+#else //!_WIN32
+ if (!missed_) {
+ if (!hdc) {
+ Dpy_ = glXGetCurrentDisplay_();
+ } else {
+ Dpy_ = (Display*)hdc;
+ }
+ Drawable_ = glXGetCurrentDrawable_();
+ origCtx_ = (GLXContext)hglrc;
+
+ int attribList[] = {GLX_RGBA, None};
+ if (!(intDpy_ = XOpenDisplay_(DisplayString(Dpy_)))) {
+#if defined(ATI_ARCH_X86)
+ asm("int $3");
+#endif
+ }
+ intDrawable_ = DefaultRootWindow(intDpy_);
+
+ XVisualInfo* vis;
+ int defaultScreen = DefaultScreen(intDpy_);
+ if (!(vis = glXChooseVisual_(intDpy_, defaultScreen, attribList))) {
+ return false;
+ }
+ if (!(intCtx_ = glXCreateContext_(intDpy_, vis, origCtx_, true))) {
+ return false;
+ }
+ return true;
+ }
+#endif //!_WIN32
+ return false;
+}
+
+bool GLFunctions::setIntEnv() {
+ if (isEGL_) {
+ return true;
+ }
+#ifdef _WIN32
+ // Save current DC and GLRC
+ tempDC_ = wglGetCurrentDC_();
+ tempGLRC_ = wglGetCurrentContext_();
+ // Set internal DC and GLRC
+ if (tempDC_ != getDC() || tempGLRC_ != getIntGLRC()) {
+ if (!wglMakeCurrent_(getDC(), getIntGLRC())) {
+ DWORD err = GetLastError();
+ LogWarning("cannot set internal GL environment");
+ return false;
+ }
+ }
+#else //!_WIN32
+ tempDpy_ = glXGetCurrentDisplay_();
+ tempDrawable_ = glXGetCurrentDrawable_();
+ tempCtx_ = glXGetCurrentContext_();
+ // Set internal Display and GLXContext
+ if (tempDpy_ != getDpy() || tempCtx_ != getIntCtx()) {
+ if (!glXMakeCurrent_(getIntDpy(), getIntDrawable(), getIntCtx())) {
+ LogWarning("cannot set internal GL environment");
+ return false;
+ }
+ }
+#endif //!_WIN32
+
+ return true;
+}
+
+bool GLFunctions::restoreEnv() {
+ if (isEGL_) {
+ // eglMakeCurrent( );
+ return true;
+ }
+#ifdef _WIN32
+ // Restore original DC and GLRC
+ if (!wglMakeCurrent_(tempDC_, tempGLRC_)) {
+ DWORD err = GetLastError();
+ LogWarning("cannot restore original GL environment");
+ return false;
+ }
+#else //!_WIN32
+ // Restore Display and GLXContext
+ if (tempDpy_) {
+ if (!glXMakeCurrent_(tempDpy_, tempDrawable_, tempCtx_)) {
+ LogWarning("cannot restore original GL environment");
+ return false;
+ }
+ } else {
+ // Just release internal context
+ if (!glXMakeCurrent_(getIntDpy(), None, NULL)) {
+ LogWarning("cannot release internal GL environment");
+ return false;
+ }
+ }
+#endif //!_WIN32
+
+ return true;
+}
+
+} // namespace amd
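+// Contract of the helpers above (an illustrative summary): init() creates an
+// internal GL context shared with the application's (wglShareLists on Windows,
+// the shareList argument of glXCreateContext_ on Linux); setIntEnv() saves the
+// caller's current context and makes the internal one current; restoreEnv()
+// switches back. The SetIntEnv guard pairs them under the object's lock:
+//   GLFunctions::SetIntEnv ie(glenv);          // 'glenv' is an assumed GLFunctions*
+//   if (ie.isValid()) { /* issue GL calls */ } // environment restored on scope exit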
diff --git a/rocclr/cl_gl_amd.hpp b/rocclr/cl_gl_amd.hpp
new file mode 100644
index 0000000000..36831fa747
--- /dev/null
+++ b/rocclr/cl_gl_amd.hpp
@@ -0,0 +1,379 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef CL_GL_AMD_HPP_
+#define CL_GL_AMD_HPP_
+
+#ifdef _WIN32
+#include <windows.h>
+#else //!_WIN32
+#include <X11/Xlib.h>
+#endif //!_WIN32
+
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include "CL/cl_gl.h"
+#ifndef _WIN32
+#include <GL/glx.h>
+#endif //!_WIN32
+
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include
+
+#include "platform/context.hpp"
+#include "platform/command.hpp"
+
+namespace amd
+{
+
+//! Class GLObject keeps all the info about the GL object
+//! from which the CL object is created
+class GLObject : public InteropObject
+{
+protected:
+ cl_gl_object_type clGLType_; //!< CL GL object type
+ GLenum glTarget_;
+ GLuint gluiName_;
+ GLint gliMipLevel_;
+ GLenum glInternalFormat_;
+ GLint gliWidth_;
+ GLint gliHeight_;
+ GLint gliDepth_;
+ GLenum glCubemapFace_;
+ GLsizei glNumSamples_;
+
+public:
+//! GLObject constructor initializes member variables
+ GLObject(
+ GLenum glTarget,
+ GLuint gluiName,
+ GLint gliMipLevel,
+ GLenum glInternalFormat,
+ GLint gliWidth,
+ GLint gliHeight,
+ GLint gliDepth,
+ cl_gl_object_type clGLType,
+ GLenum glCubemapFace,
+ GLsizei glNumSamples
+ ): // Initialization of member variables
+ clGLType_(clGLType),
+ glTarget_(glTarget),
+ gluiName_(gluiName),
+ gliMipLevel_(gliMipLevel),
+ glInternalFormat_(glInternalFormat),
+ gliWidth_(gliWidth),
+ gliHeight_(gliHeight),
+ gliDepth_(gliDepth),
+ glCubemapFace_(glCubemapFace),
+ glNumSamples_(glNumSamples)
+ {
+ }
+
+ virtual ~GLObject() {}
+ virtual GLObject* asGLObject() {return this;}
+
+//! GLObject query functions to get GL info from member variables
+ GLenum getGLTarget() const {return glTarget_;}
+ GLuint getGLName() const {return gluiName_;}
+ GLint getGLMipLevel() const {return gliMipLevel_;}
+ GLenum getGLInternalFormat() const {return glInternalFormat_;}
+ GLint getGLSize() const {return gliWidth_;}
+ GLint getGLWidth() const {return gliWidth_;}
+ GLint getGLHeight() const {return gliHeight_;}
+ GLint getGLDepth() const {return gliDepth_;}
+ cl_gl_object_type getCLGLObjectType() const { return clGLType_; }
+ GLenum getCubemapFace() const {return glCubemapFace_;}
+ GLsizei getNumSamples() const { return glNumSamples_;}
+};
+
+
+//! Class BufferGL is derived from classes Buffer and GLObject
+//! where the former keeps all data for CL object and
+//! the latter keeps all data for GL object
+class BufferGL : public Buffer, public GLObject
+{
+protected:
+ //! Initializes the device memory array which is nested
+ // after 'BufferGL' object in memory layout.
+ virtual void initDeviceMemory();
+public:
+//! BufferGL constructor just calls constructors of base classes
+//! to pass down the parameters
+ BufferGL(
+ Context& amdContext,
+ cl_mem_flags clFlags,
+ size_t uiSizeInBytes,
+ GLenum glTarget,
+ GLuint gluiName)
+ : // Call base classes constructors
+ Buffer(
+ amdContext,
+ clFlags,
+ uiSizeInBytes
+ ),
+ GLObject(
+ glTarget,
+ gluiName,
+ 0, // Mipmap level default
+ GL_ARRAY_BUFFER, // Just init to some value
+ (GLint) uiSizeInBytes,
+ 1,
+ 1,
+ CL_GL_OBJECT_BUFFER,
+ 0,
+ 0
+ )
+ {
+ setInteropObj(this);
+ }
+ virtual ~BufferGL() {}
+
+ virtual BufferGL* asBufferGL() { return this; }
+};
+
+
+//! Class ImageGL is derived from classes Image and GLObject
+//! where the former keeps all data for CL object and
+//! the latter keeps all data for GL object
+class ImageGL : public Image, public GLObject
+{
+public:
+ //! ImageGL constructor just calls constructors of base classes
+ //! to pass down the parameters
+ ImageGL(
+ Context& amdContext,
+ cl_mem_object_type clType,
+ cl_mem_flags clFlags,
+ const Format& format,
+ size_t width,
+ size_t height,
+ size_t depth,
+ GLenum glTarget,
+ GLuint gluiName,
+ GLint gliMipLevel,
+ GLenum glInternalFormat,
+ cl_gl_object_type clGLType,
+ GLsizei numSamples,
+ GLenum glCubemapFace = 0)
+ : Image(amdContext, clType, clFlags, format, width, height, depth,
+ Format(format).getElementSize() * width,
+ Format(format).getElementSize() * width * depth)
+ , GLObject(glTarget, gluiName, gliMipLevel, glInternalFormat,
+ static_cast<GLint>(width), static_cast<GLint>(height),
+ static_cast<GLint>(depth), clGLType, glCubemapFace, numSamples)
+ {
+ setInteropObj(this);
+ }
+
+ virtual ~ImageGL() {}
+
+protected:
+ //! Initializes the device memory array which is nested
+ // after 'ImageGL' object in memory layout.
+ virtual void initDeviceMemory();
+};
+
+#ifdef _WIN32
+#define APICALL WINAPI
+#define GETPROCADDRESS GetProcAddress
+#define API_GETPROCADDR "wglGetProcAddress"
+#define FCN_STR_TYPE LPCSTR
+ typedef PROC (WINAPI* PFN_xxxGetProcAddress) (LPCSTR fcnName);
+ typedef HGLRC (APICALL* PFN_wglCreateContext) (HDC hdc);
+ typedef HGLRC (APICALL* PFN_wglGetCurrentContext) (void);
+ typedef HDC (APICALL* PFN_wglGetCurrentDC) (void);
+ typedef BOOL (APICALL* PFN_wglDeleteContext) (HGLRC hglrc);
+ typedef BOOL (APICALL* PFN_wglMakeCurrent) (HDC hdc, HGLRC hglrc);
+ typedef BOOL (APICALL* PFN_wglShareLists) (HGLRC hglrc1, HGLRC hglrc2);
+#else //!_WIN32
+#define APICALL // __stdcall //??? todo odintsov
+#define API_GETPROCADDR "glXGetProcAddress"
+#define GETPROCADDRESS dlsym
+#define FCN_STR_TYPE const GLubyte*
+#define WINAPI
+#define PROC void*
+ typedef void* (*PFN_xxxGetProcAddress) (const GLubyte* procName);
+ // X11 typedefs
+ typedef Display* (*PFNXOpenDisplay)(_Xconst char* display_name);
+ typedef int (*PFNXCloseDisplay)(Display* display);
+
+ // GLX typedefs
+ typedef GLXDrawable (*PFNglXGetCurrentDrawable)();
+ typedef Display* (*PFNglXGetCurrentDisplay)();
+ typedef GLXContext (*PFNglXGetCurrentContext)(void);
+ typedef XVisualInfo* (*PFNglXChooseVisual)(Display* dpy, int screen, int* attribList);
+ typedef GLXContext (*PFNglXCreateContext)(Display* dpy, XVisualInfo* vis, GLXContext shareList, Bool direct);
+ typedef void (*PFNglXDestroyContext)(Display* dpy, GLXContext ctx);
+ typedef Bool (*PFNglXMakeCurrent)(Display* dpy, GLXDrawable drawable, GLXContext ctx);
+ typedef void* HMODULE;
+#endif //!_WIN32
+
+#define GLPREFIX(rtype, fcn, dclargs) \
+ typedef rtype (APICALL* PFN_##fcn) dclargs;
+
+// Declare prototypes for GL functions
+#include "gl_functions.hpp"
+
+class GLFunctions
+{
+public:
+ //! RAII helper: sets the internal GL environment under the lock and
+ //! restores the previous one on destruction
+ class SetIntEnv : public amd::StackObject {
+ public:
+ //! Default constructor
+ SetIntEnv(GLFunctions* env);
+
+ //! Destructor
+ ~SetIntEnv();
+
+ //! Checks if the environment setup was successful
+ bool isValid() const { return isValid_; }
+
+ private:
+ GLFunctions* env_; //!< GL environment
+ bool isValid_; //!< If TRUE, then it's a valid setup
+ };
+
+private:
+ HMODULE libHandle_;
+ int missed_; // Number of GL functions that failed to initialize, if any
+
+ amd::Monitor lock_;
+
+ EGLDisplay eglDisplay_;
+ EGLContext eglOriginalContext_;
+ EGLContext eglInternalContext_;
+ EGLContext eglTempContext_;
+ bool isEGL_;
+
+#ifdef _WIN32
+ HGLRC hOrigGLRC_;
+ HDC hDC_;
+ HGLRC hIntGLRC_; // handle for internal GLRC to access shared context
+ HDC tempDC_;
+ HGLRC tempGLRC_;
+
+ PFN_wglCreateContext wglCreateContext_;
+ PFN_wglGetCurrentContext wglGetCurrentContext_;
+ PFN_wglGetCurrentDC wglGetCurrentDC_;
+ PFN_wglDeleteContext wglDeleteContext_;
+ PFN_wglMakeCurrent wglMakeCurrent_;
+ PFN_wglShareLists wglShareLists_;
+#else
+public:
+ Display* Dpy_;
+ GLXDrawable Drawable_;
+ GLXContext origCtx_;
+ Display* intDpy_;
+ Window intDrawable_;
+ GLXContext intCtx_;
+ Display* tempDpy_;
+ GLXDrawable tempDrawable_;
+ GLXContext tempCtx_;
+
+ // Pointers to X11 functions
+ PFNXOpenDisplay XOpenDisplay_;
+ PFNXCloseDisplay XCloseDisplay_;
+
+ // Pointers to GLX functions
+ PFNglXGetCurrentDrawable glXGetCurrentDrawable_;
+ PFNglXGetCurrentDisplay glXGetCurrentDisplay_;
+ PFNglXGetCurrentContext glXGetCurrentContext_;
+ PFNglXChooseVisual glXChooseVisual_;
+ PFNglXCreateContext glXCreateContext_;
+ PFNglXDestroyContext glXDestroyContext_;
+ PFNglXMakeCurrent glXMakeCurrent_;
+#endif
+public:
+
+ GLFunctions(HMODULE h, bool isEGL);
+ ~GLFunctions();
+
+ // Query CL-GL context association
+ bool isAssociated() const
+ {
+ if (isEGL_ && eglDisplay_ && eglOriginalContext_) return true;
+#ifdef _WIN32
+ if (hDC_ && hOrigGLRC_) return true;
+#else //!_WIN32
+ if (Dpy_ && origCtx_) return true;
+#endif //!_WIN32
+ return false;
+ }
+ bool isEGL() const
+ {
+ return isEGL_;
+ }
+ // Accessor methods
+#ifdef _WIN32
+ HGLRC getOrigGLRC() const {return hOrigGLRC_;}
+ HDC getDC() const {return hDC_;}
+ HGLRC getIntGLRC() const {return hIntGLRC_;}
+#else //!_WIN32
+ Display* getDpy() const {return Dpy_;}
+ GLXDrawable getDrawable() const {return
Drawable_;} + GLXContext getOrigCtx() const {return origCtx_;} + + Display* getIntDpy() const {return intDpy_;} + GLXDrawable getIntDrawable() const {return intDrawable_;} + GLXContext getIntCtx() const {return intCtx_;} + + EGLDisplay getEglDpy() const { return eglDisplay_; } + EGLContext getEglOrigCtx() const { return eglOriginalContext_; } +#endif //!_WIN32 + + // Initialize GL dynamic library and function pointers + bool init(intptr_t hdc, intptr_t hglrc); + + // Return true if successful, false - if error occurred + bool setIntEnv(); + bool restoreEnv(); + + amd::Monitor& getLock() { return lock_; } + + PFN_xxxGetProcAddress GetProcAddress_; + +#define GLPREFIX(rtype, fcn, dclargs) \ + PFN_##fcn fcn##_; +// Declare pointers to GL functions +#include "gl_functions.hpp" +}; + +//! Functions for executing the GL related stuff +cl_mem clCreateFromGLBufferAMD(Context& amdContext, cl_mem_flags flags, + GLuint bufobj, cl_int* errcode_ret); +cl_mem clCreateFromGLTextureAMD(Context& amdContext, cl_mem_flags flags, + GLenum target, GLint miplevel, GLuint texture, int* errcode_ret); +cl_mem clCreateFromGLRenderbufferAMD(Context& amdContext, cl_mem_flags flags, + GLuint renderbuffer, int* errcode_ret); + +bool +getCLFormatFromGL( + const Context& amdContext, + GLint gliInternalFormat, + cl_image_format* pclImageFormat, + int* piBytesPerPixel, + cl_mem_flags flags +); + +} //namespace amd + +#endif //CL_GL_AMD_HPP_ diff --git a/rocclr/cl_lqdflash_amd.cpp b/rocclr/cl_lqdflash_amd.cpp new file mode 100644 index 0000000000..470e4bc5e9 --- /dev/null +++ b/rocclr/cl_lqdflash_amd.cpp @@ -0,0 +1,312 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#include "cl_common.hpp"
+#include
+
+#include "platform/object.hpp"
+
+#include "cl_lqdflash_amd.h"
+
+#ifndef WITH_LIQUID_FLASH
+#if (!defined(BUILD_HSA_TARGET) && defined(WITH_HSA_DEVICE) && \
+ defined(WITH_AMDGPU_PRO)) || defined(_WIN32)
+#define WITH_LIQUID_FLASH 1
+#endif // _WIN32
+#endif
+
+#if WITH_LIQUID_FLASH
+#include "lf.h"
+#include <codecvt>
+#include <locale>
+#endif // WITH_LIQUID_FLASH
+
+namespace amd {
+
+LiquidFlashFile::~LiquidFlashFile() { close(); }
+
+bool LiquidFlashFile::open() {
+#if WITH_LIQUID_FLASH
+ lf_status err;
+ lf_file_flags flags = 0;
+
+ switch (flags_) {
+ case CL_FILE_READ_ONLY_AMD:
+ flags = LF_READ;
+ break;
+ case CL_FILE_WRITE_ONLY_AMD:
+ flags = LF_WRITE;
+ break;
+ case CL_FILE_READ_WRITE_AMD:
+ flags = LF_READ | LF_WRITE;
+ break;
+ }
+#ifdef ATI_OS_LINUX
+ assert(sizeof(wchar_t) != sizeof(lf_char));
+ std::string name_char;
+ std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> cv;
+ name_char = cv.to_bytes(name_);
+ handle_ = lfOpenFile(name_char.c_str(), flags, &err);
+#else
+ handle_ = lfOpenFile(name_.c_str(), flags, &err);
+#endif
+
+ if (err != lf_success) {
+ return false;
+ }
+
+ if (lfGetFileBlockSize((lf_file)handle_, &blockSize_) != lf_success) {
+ return false;
+ }
+
+ if (lfGetFileSize((lf_file)handle_, &fileSize_) != lf_success) {
+ return false;
+ }
+ return true;
+#else
+ return false;
+#endif // WITH_LIQUID_FLASH
+}
+
+void LiquidFlashFile::close() {
+#if WITH_LIQUID_FLASH
+ if (handle_ != NULL) {
+ lfReleaseFile((lf_file)handle_);
+ handle_ = NULL;
+ }
+#endif // WITH_LIQUID_FLASH
+}
+
+bool LiquidFlashFile::transferBlock(bool writeBuffer, void* srcDst, uint64_t bufferSize,
+ uint64_t fileOffset, uint64_t bufferOffset,
+ uint64_t size) const {
+#if WITH_LIQUID_FLASH
+ lf_status status;
+
+ lf_region_descriptor region = {fileOffset / blockSize(), bufferOffset / blockSize(),
+ size / blockSize()};
+ // writeBuffer == true means the buffer is written to, i.e. data is read from the file
+ if (writeBuffer) {
+ status = lfReadFile(srcDst, bufferSize, (lf_file)handle_, 1, &region, NULL);
+ } else {
+ status = lfWriteFile(srcDst, bufferSize, (lf_file)handle_, 1, &region, NULL);
+ }
+ if (lf_success == status) {
+ return true;
+ } else {
+ return false;
+ }
+#else
+ return false;
+#endif // WITH_LIQUID_FLASH
+}
+
+} // namespace amd
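+// Illustrative use of the SSG entry points implemented below (a sketch; error
+// handling omitted; offsets and sizes must be multiples of the file block size,
+// as enforced in EnqueueTransferBufferFromSsgFileAMD):
+//   cl_int err;
+//   cl_file_amd f = clCreateSsgFileObjectAMD(ctx, CL_FILE_READ_ONLY_AMD,
+//                                            L"in.dat", &err); // hypothetical file
+//   clEnqueueReadSsgFileAMD(queue, buf, CL_TRUE, 0, nbytes, f, 0, 0, NULL, NULL);
+//   clReleaseSsgFileObjectAMD(f);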
+/*! \addtogroup API
+ * @{
+ *
+ * \addtogroup AMD_Extensions
+ * @{
+ *
+ */
+
+RUNTIME_ENTRY_RET(cl_file_amd, clCreateSsgFileObjectAMD,
+ (cl_context context, cl_file_flags_amd flags, const wchar_t* file_name,
+ cl_int* errcode_ret)) {
+#if WITH_LIQUID_FLASH && defined ATI_OS_LINUX
+ if (!is_valid(context)) {
+ *not_null(errcode_ret) = CL_INVALID_CONTEXT;
+ LogWarning("invalid parameter \"context\"");
+ return (cl_file_amd)0;
+ }
+
+ const std::vector<amd::Device*>& devices = as_amd(context)->devices();
+ bool supportPass = false;
+ for (auto& dev : devices) {
+ if (lf_success == lfCheckExtensionSupportForDevice(dev->info().pcieDeviceId_,
+ dev->info().pcieRevisionId_)) {
+ supportPass = true;
+ break;
+ }
+ }
+ if (!supportPass) {
+ *not_null(errcode_ret) = CL_INVALID_DEVICE;
+ LogWarning("SSG isn't supported");
+ return (cl_file_amd)0;
+ }
+#endif
+ amd::LiquidFlashFile* file = new amd::LiquidFlashFile(file_name, flags);
+
+ if (file == NULL) {
+ *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
+ return (cl_file_amd)0;
+ }
+
+ if (!file->open()) {
+ *not_null(errcode_ret) = CL_INVALID_VALUE;
+ delete file;
+ return (cl_file_amd)0;
+ }
+
+ *not_null(errcode_ret) = CL_SUCCESS;
+ return as_cl(file);
+}
+RUNTIME_EXIT
+
+RUNTIME_ENTRY(cl_int, clGetSsgFileObjectInfoAMD,
+ (cl_file_amd file, cl_file_info_amd param_name, size_t param_value_size,
+ void* param_value, size_t* param_value_size_ret)) {
+ if (!is_valid(file)) {
+ return CL_INVALID_FILE_OBJECT_AMD;
+ }
+
+ switch (param_name) {
+ case CL_FILE_BLOCK_SIZE_AMD: {
+ cl_uint blockSize = as_amd(file)->blockSize();
+ return amd::clGetInfo(blockSize, param_value_size, param_value, param_value_size_ret);
+ }
+ case CL_FILE_SIZE_AMD: {
+ cl_ulong fileSize = as_amd(file)->fileSize();
+ return amd::clGetInfo(fileSize, param_value_size, param_value, param_value_size_ret);
+ }
+ default:
+ break;
+ }
+
+ return CL_INVALID_VALUE;
+}
+RUNTIME_EXIT
+
+RUNTIME_ENTRY(cl_int, clRetainSsgFileObjectAMD, (cl_file_amd file)) {
+ if (!is_valid(file)) {
+ return CL_INVALID_FILE_OBJECT_AMD;
+ }
+ as_amd(file)->retain();
+ return CL_SUCCESS;
+}
+RUNTIME_EXIT
+
+RUNTIME_ENTRY(cl_int, clReleaseSsgFileObjectAMD, (cl_file_amd file)) {
+ if (!is_valid(file)) {
+ return CL_INVALID_FILE_OBJECT_AMD;
+ }
+ as_amd(file)->release();
+ return CL_SUCCESS;
+}
+RUNTIME_EXIT
+
+static cl_int EnqueueTransferBufferFromSsgFileAMD(
+ cl_bool isWrite, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
+ size_t buffer_offset, size_t cb, cl_file_amd file, size_t file_offset,
+ cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) {
+ if (!is_valid(command_queue)) {
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+
+ if (!is_valid(buffer)) {
+ return CL_INVALID_MEM_OBJECT;
+ }
+ amd::Buffer* pBuffer = as_amd(buffer)->asBuffer();
+ if (pBuffer == NULL) {
+ return CL_INVALID_MEM_OBJECT;
+ }
+
+ if (pBuffer->getMemFlags() & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ return CL_INVALID_OPERATION;
+ }
+
+ amd::HostQueue* queue = as_amd(command_queue)->asHostQueue();
+ if (NULL == queue) {
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+ amd::HostQueue& hostQueue = *queue;
+
+ if (hostQueue.context() != pBuffer->getContext()) {
+ return CL_INVALID_CONTEXT;
+ }
+
+ if (!is_valid(file)) {
+ return CL_INVALID_FILE_OBJECT_AMD;
+ }
+
+ amd::LiquidFlashFile* amdFile = as_amd(file);
+ amd::Coord3D bufferOffset(buffer_offset, 0, 0);
+ amd::Coord3D bufferSize(cb, 1, 1);
+
+ if ((!pBuffer->validateRegion(bufferOffset, bufferSize)) ||
+ // LF library supports aligned sizes only
+ ((buffer_offset % amdFile->blockSize()) != 0) || ((cb % amdFile->blockSize()) != 0) || + ((file_offset % amdFile->blockSize()) != 0)) { + return CL_INVALID_VALUE; + } + + amd::Command::EventWaitList eventWaitList; + cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, + event_wait_list); + if (err != CL_SUCCESS) { + return err; + } + + amd::TransferBufferFileCommand* command; + command = new amd::TransferBufferFileCommand( + isWrite ? CL_COMMAND_READ_SSG_FILE_AMD : CL_COMMAND_WRITE_SSG_FILE_AMD, hostQueue, + eventWaitList, *pBuffer, bufferOffset, bufferSize, amdFile, file_offset); + + if (command == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + + // Make sure we have memory for the command execution + if (!command->validateMemory()) { + delete command; + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } + + command->enqueue(); + if (blocking_write) { + command->awaitCompletion(); + } + + *not_null(event) = as_cl(&command->event()); + if (event == NULL) { + command->release(); + } + return CL_SUCCESS; +} + +RUNTIME_ENTRY(cl_int, clEnqueueReadSsgFileAMD, + (cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, + size_t buffer_offset, size_t cb, cl_file_amd file, size_t file_offset, + cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event)) { + return EnqueueTransferBufferFromSsgFileAMD(CL_TRUE, command_queue, buffer, blocking_write, + buffer_offset, cb, file, file_offset, + num_events_in_wait_list, event_wait_list, event); +} +RUNTIME_EXIT + +RUNTIME_ENTRY(cl_int, clEnqueueWriteSsgFileAMD, + (cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, + size_t buffer_offset, size_t cb, cl_file_amd file, size_t file_offset, + cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event)) { + return EnqueueTransferBufferFromSsgFileAMD(CL_FALSE, command_queue, buffer, blocking_write, + buffer_offset, cb, file, file_offset, + num_events_in_wait_list, event_wait_list, event); +} +RUNTIME_EXIT diff --git a/rocclr/cl_lqdflash_amd.h b/rocclr/cl_lqdflash_amd.h new file mode 100644 index 0000000000..5a3e725b4c --- /dev/null +++ b/rocclr/cl_lqdflash_amd.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef __CL_LQDFLASH_AMD_H +#define __CL_LQDFLASH_AMD_H + +#include "CL/cl_ext.h" + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + +extern CL_API_ENTRY cl_file_amd CL_API_CALL +clCreateSsgFileObjectAMD(cl_context context, cl_file_flags_amd flags, const wchar_t* file_name, + cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL clGetSsgFileObjectInfoAMD( + cl_file_amd file, cl_file_info_amd param_name, size_t param_value_size, void* param_value, + size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL clRetainSsgFileObjectAMD(cl_file_amd file) + CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL clReleaseSsgFileObjectAMD(cl_file_amd file) + CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadSsgFileAMD( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t buffer_offset, + size_t cb, cl_file_amd file, size_t file_offset, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteSsgFileAMD( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t buffer_offset, + size_t cb, cl_file_amd file, size_t file_offset, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} /*extern "C"*/ +#endif /*__cplusplus*/ + +#endif diff --git a/rocclr/fixme.cpp b/rocclr/fixme.cpp new file mode 100644 index 0000000000..90f034f63e --- /dev/null +++ b/rocclr/fixme.cpp @@ -0,0 +1,32 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "vdi_common.hpp" +#include + +cl_icd_dispatch amd::ICDDispatchedObject::icdVendorDispatch_[] = {0}; +amd::PlatformIDS amd::PlatformID::Platform = {amd::ICDDispatchedObject::icdVendorDispatch_}; + +RUNTIME_ENTRY(cl_int, clGetDeviceIDs, + (cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, + cl_device_id* devices, cl_uint* num_devices)) { + return CL_SUCCESS; +} +RUNTIME_EXIT diff --git a/rocclr/hip_activity.cpp b/rocclr/hip_activity.cpp new file mode 100644 index 0000000000..d3ce84bfd9 --- /dev/null +++ b/rocclr/hip_activity.cpp @@ -0,0 +1,35 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "platform/activity.hpp" + +extern "C" void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg) { + activity_prof::CallbacksTable::init(reinterpret_cast(id_callback), + reinterpret_cast(op_callback), + arg); +} + +extern "C" bool hipEnableActivityCallback(unsigned op, bool enable) { + return activity_prof::CallbacksTable::SetEnabled(op, enable); +} + +extern "C" const char* hipGetCmdName(unsigned op) { + return getOclCommandKindString(static_cast(op)); +} diff --git a/rocclr/hip_code_object.cpp b/rocclr/hip_code_object.cpp new file mode 100755 index 0000000000..9881b79c3e --- /dev/null +++ b/rocclr/hip_code_object.cpp @@ -0,0 +1,782 @@ +/* +Copyright (c) 2015-2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "hip_code_object.hpp" +#include "amd_hsa_elf.hpp" + +#include + +#include +#include "hip/hip_runtime_api.h" +#include "hip/hip_runtime.h" +#include "hip_internal.hpp" +#include "platform/program.hpp" +#include + +hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, + amd::HostQueue& queue, bool isAsync = false); +namespace { +size_t constexpr strLiteralLength(char const* str) { + return *str ? 
1 + strLiteralLength(str + 1) : 0;
+}
+constexpr char const* CLANG_OFFLOAD_BUNDLER_MAGIC_STR = "__CLANG_OFFLOAD_BUNDLE__";
+constexpr char const* OFFLOAD_KIND_HIP = "hip";
+constexpr char const* OFFLOAD_KIND_HIPV4 = "hipv4";
+constexpr char const* OFFLOAD_KIND_HCC = "hcc";
+constexpr char const* AMDGCN_TARGET_TRIPLE = "amdgcn-amd-amdhsa-";
+
+// ClangOffloadBundle info.
+static constexpr size_t bundle_magic_string_size =
+ strLiteralLength(CLANG_OFFLOAD_BUNDLER_MAGIC_STR);
+
+// Clang Offload bundler description & Header.
+struct __ClangOffloadBundleInfo {
+ uint64_t offset;
+ uint64_t size;
+ uint64_t bundleEntryIdSize;
+ const char bundleEntryId[1];
+};
+
+struct __ClangOffloadBundleHeader {
+ const char magic[bundle_magic_string_size - 1];
+ uint64_t numOfCodeObjects;
+ __ClangOffloadBundleInfo desc[1];
+};
+} // namespace
+
+namespace hip {
+
+uint64_t CodeObject::ElfSize(const void *emi) {
+ return amd::Elf::getElfSize(emi);
+}
+
+static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupported,
+ bool& sramEccSupported) {
+ switch (EFlags & EF_AMDGPU_MACH) {
+ case EF_AMDGPU_MACH_AMDGCN_GFX700:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx700";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX701:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx701";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX702:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx702";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX703:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx703";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX704:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx704";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX705:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx705";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX801:
+ xnackSupported = true;
+ sramEccSupported = false;
+ proc_name = "gfx801";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX802:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx802";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX803:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx803";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX805:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx805";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX810:
+ xnackSupported = true;
+ sramEccSupported = false;
+ proc_name = "gfx810";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX900:
+ xnackSupported = true;
+ sramEccSupported = false;
+ proc_name = "gfx900";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX902:
+ xnackSupported = true;
+ sramEccSupported = false;
+ proc_name = "gfx902";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX904:
+ xnackSupported = true;
+ sramEccSupported = false;
+ proc_name = "gfx904";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX906:
+ xnackSupported = true;
+ sramEccSupported = true;
+ proc_name = "gfx906";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX908:
+ xnackSupported = true;
+ sramEccSupported = true;
+ proc_name = "gfx908";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX909:
+ xnackSupported = true;
+ sramEccSupported = false;
+ proc_name = "gfx909";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX90A:
+ xnackSupported = true;
+ sramEccSupported = true;
+ proc_name = "gfx90a";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX90C:
+ xnackSupported = true;
+ sramEccSupported = false;
+ proc_name = "gfx90c";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX1010:
+ xnackSupported = true;
+ sramEccSupported = false;
+ proc_name = "gfx1010";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX1011:
+ xnackSupported = true;
+ sramEccSupported = false;
+ proc_name = "gfx1011";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX1012:
+ xnackSupported = true;
+ sramEccSupported = false;
+ proc_name = "gfx1012";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX1030:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx1030";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX1031:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx1031";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX1032:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx1032";
+ break;
+ case EF_AMDGPU_MACH_AMDGCN_GFX1033:
+ xnackSupported = false;
+ sramEccSupported = false;
+ proc_name = "gfx1033";
+ break;
+ default:
+ return false;
+ }
+ return true;
+}
+
+static bool getTripleTargetIDFromCodeObject(const void* code_object, std::string& target_id,
+ unsigned& co_version) {
+ if (!code_object) return false;
+ const Elf64_Ehdr* ehdr = reinterpret_cast<const Elf64_Ehdr*>(code_object);
+ if (ehdr->e_machine != EM_AMDGPU) return false;
+ if (ehdr->e_ident[EI_OSABI] != ELFOSABI_AMDGPU_HSA) return false;
+
+ bool isXnackSupported{false}, isSramEccSupported{false};
+
+ std::string proc_name;
+ if (!getProcName(ehdr->e_flags, proc_name, isXnackSupported, isSramEccSupported)) return false;
+ target_id = std::string(AMDGCN_TARGET_TRIPLE) + '-' + proc_name;
+
+ switch (ehdr->e_ident[EI_ABIVERSION]) {
+ case ELFABIVERSION_AMDGPU_HSA_V2: {
+ co_version = 2;
+ return false;
+ }
+
+ case ELFABIVERSION_AMDGPU_HSA_V3: {
+ co_version = 3;
+ if (isSramEccSupported) {
+ if (ehdr->e_flags & EF_AMDGPU_FEATURE_SRAMECC_V3)
+ target_id += ":sramecc+";
+ else
+ target_id += ":sramecc-";
+ }
+ if (isXnackSupported) {
+ if (ehdr->e_flags & EF_AMDGPU_FEATURE_XNACK_V3)
+ target_id += ":xnack+";
+ else
+ target_id += ":xnack-";
+ }
+ break;
+ }
+
+ case ELFABIVERSION_AMDGPU_HSA_V4: {
+ co_version = 4;
+ unsigned co_sram_value = (ehdr->e_flags) & EF_AMDGPU_FEATURE_SRAMECC_V4;
+ if (co_sram_value == EF_AMDGPU_FEATURE_SRAMECC_OFF_V4)
+ target_id += ":sramecc-";
+ else if (co_sram_value == EF_AMDGPU_FEATURE_SRAMECC_ON_V4)
+ target_id += ":sramecc+";
+
+ unsigned co_xnack_value = (ehdr->e_flags) & EF_AMDGPU_FEATURE_XNACK_V4;
+ if (co_xnack_value == EF_AMDGPU_FEATURE_XNACK_OFF_V4)
+ target_id += ":xnack-";
+ else if (co_xnack_value == EF_AMDGPU_FEATURE_XNACK_ON_V4)
+ target_id += ":xnack+";
+ break;
+ }
+
+ default: {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Consumes the string 'consume_' from the start of the given input.
+// e.g.: input = amdgcn-amd-amdhsa--gfx908 and consume_ = amdgcn-amd-amdhsa--
+// leaves input = gfx908.
+static bool consume(std::string& input, std::string consume_) {
+ if (input.substr(0, consume_.size()) != consume_) {
+ return false;
+ }
+ input = input.substr(consume_.size());
+ return true;
+}
+
+// Trims the string up to the given character and returns the trimmed-off part;
+// used to get the gpu name. Example: for input "gfx908:sramecc+" and trim char
+// ':' the function returns "gfx908" and leaves input as ":sramecc+".
+static std::string trimName(std::string& input, char trim) {
+ auto pos_ = input.find(trim);
+ auto res = input;
+ if (pos_ == std::string::npos) {
+ input = "";
+ } else {
+ res = input.substr(0, pos_);
+ input = input.substr(pos_);
+ }
+ return res;
+}
+
+static char getFeatureValue(std::string& input, std::string feature) {
+ char res = ' ';
+ if (consume(input, std::move(feature))) {
+ res = input[0];
+ input = input.substr(1);
+ }
+ return res;
+}
+
+static bool getTargetIDValue(std::string& input, std::string& processor, char& sramecc_value,
+ char& xnack_value) {
+ processor = trimName(input, ':');
+ sramecc_value = getFeatureValue(input, std::string(":sramecc"));
+ if (sramecc_value != ' ' && sramecc_value != '+' && sramecc_value != '-') return false;
+ xnack_value = getFeatureValue(input, std::string(":xnack"));
+ if (xnack_value != ' ' && xnack_value != '+' && xnack_value != '-') return false;
+ return true;
+}
+
+static bool getTripleTargetID(std::string bundled_co_entry_id, const void* code_object,
+ std::string& co_triple_target_id, unsigned& co_version) {
+ std::string offload_kind = trimName(bundled_co_entry_id, '-');
+ if (offload_kind != OFFLOAD_KIND_HIPV4 && offload_kind != OFFLOAD_KIND_HIP &&
+ offload_kind != OFFLOAD_KIND_HCC)
+ return false;
+
+ if (offload_kind != OFFLOAD_KIND_HIPV4)
+ return getTripleTargetIDFromCodeObject(code_object, co_triple_target_id, co_version);
+
+ // For code object V4 onwards the bundled code object entry ID correctly
+ // specifies the target triple.
+ co_version = 4;
+ co_triple_target_id = bundled_co_entry_id.substr(1);
+ return true;
+}
+
+static bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id,
+ std::string agent_triple_target_id) {
+ // Primitive Check
+ if (co_triple_target_id == agent_triple_target_id) return true;
+
+ // Parse code object triple target id
+ if (!consume(co_triple_target_id, std::string(AMDGCN_TARGET_TRIPLE) + '-')) {
+ return false;
+ }
+
+ std::string co_processor;
+ char co_sram_ecc, co_xnack;
+ if (!getTargetIDValue(co_triple_target_id, co_processor, co_sram_ecc, co_xnack)) {
+ return false;
+ }
+
+ if (!co_triple_target_id.empty()) return false;
+
+ // Parse agent isa triple target id
+ if (!consume(agent_triple_target_id, std::string(AMDGCN_TARGET_TRIPLE) + '-')) {
+ return false;
+ }
+
+ std::string agent_isa_processor;
+ char isa_sram_ecc, isa_xnack;
+ if (!getTargetIDValue(agent_triple_target_id, agent_isa_processor, isa_sram_ecc, isa_xnack)) {
+ return false;
+ }
+
+ if (!agent_triple_target_id.empty()) return false;
+
+ // Check for compatibility
+ if (agent_isa_processor != co_processor) return false;
+ if (co_sram_ecc != ' ') {
+ if (co_sram_ecc != isa_sram_ecc) return false;
+ }
+ if (co_xnack != ' ') {
+ if (co_xnack != isa_xnack) return false;
+ }
+
+ return true;
+}
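+// Worked example for the matching above (illustrative): the bundle entry id
+// "hipv4-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-" yields the triple target id
+// "amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-", which only matches an agent with
+// the same processor and feature settings; a code object that leaves a feature
+// unspecified (e.g. plain "amdgcn-amd-amdhsa--gfx906") matches an agent with
+// any setting of that feature, per the ' ' checks above.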
+// This will be moved to COMGR eventually
+hipError_t CodeObject::ExtractCodeObjectFromFile(amd::Os::FileDesc fdesc, size_t fsize,
+ const void** image, const std::vector<std::string>& device_names,
+ std::vector<std::pair<const void*, size_t>>& code_objs) {
+
+ hipError_t hip_error = hipSuccess;
+
+ if (fdesc < 0) {
+ return hipErrorFileNotFound;
+ }
+
+ // Map the file to memory, with offset 0.
+ // file will be unmapped in ModuleUnload
+ if (!amd::Os::MemoryMapFileDesc(fdesc, fsize, 0, image)) {
+ return hipErrorInvalidValue;
+ }
+
+ // retrieve code_objs{binary_image, binary_size} for devices
+ hip_error = extractCodeObjectFromFatBinary(*image, device_names, code_objs);
+
+ return hip_error;
+}
+
+// This will be moved to COMGR eventually
+hipError_t CodeObject::ExtractCodeObjectFromMemory(const void* data,
+ const std::vector<std::string>& device_names,
+ std::vector<std::pair<const void*, size_t>>& code_objs,
+ std::string& uri) {
+
+ // Get the URI from memory
+ if (!amd::Os::GetURIFromMemory(data, 0, uri)) {
+ return hipErrorInvalidValue;
+ }
+
+ return extractCodeObjectFromFatBinary(data, device_names, code_objs);
+}
+
+// This will be moved to COMGR eventually
+hipError_t CodeObject::extractCodeObjectFromFatBinary(const void* data,
+ const std::vector<std::string>& agent_triple_target_ids,
+ std::vector<std::pair<const void*, size_t>>& code_objs) {
+ std::string magic((const char*)data, bundle_magic_string_size);
+ if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC_STR)) {
+ return hipErrorInvalidKernelFile;
+ }
+
+ // Initialize Code objects
+ code_objs.reserve(agent_triple_target_ids.size());
+ for (size_t i = 0; i < agent_triple_target_ids.size(); i++) {
+ code_objs.push_back(std::make_pair(nullptr, 0));
+ }
+
+ const auto obheader = reinterpret_cast<const __ClangOffloadBundleHeader*>(data);
+ const auto* desc = &obheader->desc[0];
+ size_t num_code_objs = code_objs.size();
+ for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i,
+ desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
+ reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) +
+ desc->bundleEntryIdSize)) {
+ const void* image =
+ reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(obheader) + desc->offset);
+ const size_t image_size = desc->size;
+
+ if (num_code_objs == 0) break;
+ std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize};
+
+ unsigned co_version = 0;
+ std::string co_triple_target_id;
+ if (!getTripleTargetID(bundleEntryId, image, co_triple_target_id, co_version)) continue;
+
+ for (size_t dev = 0; dev < agent_triple_target_ids.size(); ++dev) {
+ if (code_objs[dev].first) continue;
+ if (isCodeObjectCompatibleWithDevice(co_triple_target_id, agent_triple_target_ids[dev])) {
+ code_objs[dev] = std::make_pair(image, image_size);
+ --num_code_objs;
+ }
+ }
+ }
+ if (num_code_objs == 0) {
+ return hipSuccess;
+ } else {
+ LogPrintfError("%s",
+ "hipErrorNoBinaryForGpu: Unable to find code object for all current devices!");
+ LogPrintfError("%s", " Devices:");
+ for (size_t i = 0; i < agent_triple_target_ids.size(); i++) {
+ LogPrintfError(" %s - [%s]", agent_triple_target_ids[i].c_str(),
+ ((code_objs[i].first) ? "Found" : "Not Found"));
+ }
+ const auto obheader = reinterpret_cast<const __ClangOffloadBundleHeader*>(data);
+ const auto* desc = &obheader->desc[0];
+ LogPrintfError("%s", " Bundled Code Objects:");
+ for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i,
+ desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
+ reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) +
+ desc->bundleEntryIdSize)) {
+ std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize};
+ const void* image =
+ reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(obheader) + desc->offset);
+
+ unsigned co_version = 0;
+ std::string co_triple_target_id;
+ bool valid_co = getTripleTargetID(bundleEntryId, image, co_triple_target_id, co_version);
+
+ if (valid_co) {
+ LogPrintfError(" %s - [code object v%u is %s]", bundleEntryId.c_str(), co_version,
+ co_triple_target_id.c_str());
+ } else {
+ LogPrintfError(" %s - [Unsupported]", bundleEntryId.c_str());
+ }
+ }
+
+ guarantee(false, "hipErrorNoBinaryForGpu: Unable to find code object for all current devices!");
+ return hipErrorNoBinaryForGpu;
+ }
+}
+
+hipError_t DynCO::loadCodeObject(const char* fname, const void* image) {
+
+ amd::ScopedLock lock(dclock_);
+
+ // Number of devices = 1 in dynamic code object
+ fb_info_ = new FatBinaryInfo(fname, image);
+ std::vector<hip::Device*> devices = { g_devices[ihipGetDevice()] };
+ IHIP_RETURN_ONFAIL(fb_info_->ExtractFatBinary(devices));
+
+ // No Lazy loading for DynCO
+ IHIP_RETURN_ONFAIL(fb_info_->BuildProgram(ihipGetDevice()));
+
+ // Define Global variables
+ IHIP_RETURN_ONFAIL(populateDynGlobalVars());
+
+ // Define Global functions
+ IHIP_RETURN_ONFAIL(populateDynGlobalFuncs());
+
+ return hipSuccess;
+}
+
+// Dynamic Code Object
+DynCO::~DynCO() {
+ amd::ScopedLock lock(dclock_);
+
+ for (auto& elem : vars_) {
+ delete elem.second;
+ }
+ vars_.clear();
+
+ for (auto& elem : functions_) {
+ delete elem.second;
+ }
+ functions_.clear();
+
+ delete fb_info_;
+}
+
+hipError_t DynCO::getDeviceVar(DeviceVar** dvar, std::string var_name) {
+ amd::ScopedLock lock(dclock_);
+
+ CheckDeviceIdMatch();
+
+ auto it = vars_.find(var_name);
+ if (it == vars_.end()) {
+ LogPrintfError("Cannot find the Var: %s ", var_name.c_str());
+ return hipErrorNotFound;
+ }
+
+ it->second->getDeviceVar(dvar, device_id_, module());
+ return hipSuccess;
+}
+
+hipError_t DynCO::getDynFunc(hipFunction_t* hfunc, std::string func_name) {
+ amd::ScopedLock lock(dclock_);
+
+ CheckDeviceIdMatch();
+
+ if (hfunc == nullptr) {
+ return hipErrorInvalidValue;
+ }
+
+ auto it = functions_.find(func_name);
+ if (it == functions_.end()) {
+ LogPrintfError("Cannot find the function: %s ", func_name.c_str());
+ return hipErrorNotFound;
+ }
+
+ /* See if this could be solved */
+ return it->second->getDynFunc(hfunc, module());
+}
+
+hipError_t DynCO::populateDynGlobalVars() {
+ amd::ScopedLock lock(dclock_);
+
+ std::vector<std::string> var_names;
+ std::vector<std::string> undef_var_names;
+
+ // For Dynamic Modules there is only one hipFatBinaryDevInfo_
+ device::Program* dev_program
+ = fb_info_->GetProgram(ihipGetDevice())->getDeviceProgram
+ (*hip::getCurrentDevice()->devices()[0]);
+
+ if (!dev_program->getGlobalVarFromCodeObj(&var_names)) {
+ LogPrintfError("Could not get Global vars from Code Obj for Module: 0x%x \n", module());
+ return hipErrorSharedObjectSymbolNotFound;
+ }
+
+ for (auto& elem : var_names) {
+ vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Variable, 0, 0, 0, nullptr)));
+ }
+
+ return hipSuccess;
+}
+hipError_t DynCO::populateDynGlobalFuncs() {
+ amd::ScopedLock lock(dclock_);
+
+ std::vector<std::string> func_names;
+ device::Program* dev_program
+ = fb_info_->GetProgram(ihipGetDevice())->getDeviceProgram(
+ *hip::getCurrentDevice()->devices()[0]);
+
+ // Get all the global func names from COMGR
+ if (!dev_program->getGlobalFuncFromCodeObj(&func_names)) {
+ LogPrintfError("Could not get Global Funcs from Code Obj for Module: 0x%x \n", module());
+ return hipErrorSharedObjectSymbolNotFound;
+ }
+
+ for (auto& elem : func_names) {
+ functions_.insert(std::make_pair(elem, new Function(elem)));
+ }
+
+ return hipSuccess;
+}
+
+// Static Code Object
+StatCO::StatCO() {
+}
+
+StatCO::~StatCO() {
+ amd::ScopedLock lock(sclock_);
+
+ for (auto& elem : functions_) {
+ delete elem.second;
+ }
+ functions_.clear();
+
+ for (auto& elem : vars_) {
+ delete elem.second;
+ }
+ vars_.clear();
+}
+
+hipError_t StatCO::digestFatBinary(const void* data, FatBinaryInfo*& programs) {
+ amd::ScopedLock lock(sclock_);
+
+ if (programs != nullptr) {
+ return hipSuccess;
+ }
+
+ // Create a new fat binary object and extract the fat binary for all devices.
+ programs = new FatBinaryInfo(nullptr, data);
+ IHIP_RETURN_ONFAIL(programs->ExtractFatBinary(g_devices));
+
+ return hipSuccess;
+}
+
+FatBinaryInfo** StatCO::addFatBinary(const void* data, bool initialized) {
+ amd::ScopedLock lock(sclock_);
+
+ if (initialized) {
+ digestFatBinary(data, modules_[data]);
+ }
+ return &modules_[data];
+}
+
+hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
+ amd::ScopedLock lock(sclock_);
+
+ auto vit = vars_.begin();
+ while (vit != vars_.end()) {
+ if (vit->second->moduleInfo() == module) {
+ delete vit->second;
+ vit = vars_.erase(vit);
+ } else {
+ ++vit;
+ }
+ }
+
+ auto it = managedVars_.begin();
+ while (it != managedVars_.end()) {
+ if ((*it)->moduleInfo() == module) {
+ delete *it;
+ it = managedVars_.erase(it);
+ } else {
+ ++it;
+ }
+ }
+
+ auto fit = functions_.begin();
+ while (fit != functions_.end()) {
+ if (fit->second->moduleInfo() == module) {
+ delete fit->second;
+ fit = functions_.erase(fit);
+ } else {
+ ++fit;
+ }
+ }
+
+ auto mit = modules_.begin();
+ while (mit != modules_.end()) {
+ if (&mit->second == module) {
+ delete mit->second;
+ mit = modules_.erase(mit);
+ } else {
+ ++mit;
+ }
+ }
+
+ return hipSuccess;
+}
+
+hipError_t StatCO::registerStatFunction(const void* hostFunction, Function* func) {
+ amd::ScopedLock lock(sclock_);
+
+ if (functions_.find(hostFunction) != functions_.end()) {
+ DevLogPrintfError("hostFunctionPtr: 0x%x already exists", hostFunction);
+ }
+ functions_.insert(std::make_pair(hostFunction, func));
+
+ return hipSuccess;
+}
+
+hipError_t StatCO::getStatFunc(hipFunction_t* hfunc, const void* hostFunction, int deviceId) {
+ amd::ScopedLock lock(sclock_);
+
+ const auto it = functions_.find(hostFunction);
+ if (it == functions_.end()) {
+ return hipErrorInvalidSymbol;
+ }
+
+ return it->second->getStatFunc(hfunc, deviceId);
+}
+
+hipError_t StatCO::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId) {
+ amd::ScopedLock lock(sclock_);
+
+ const auto it = functions_.find(hostFunction);
+ if (it == functions_.end()) {
+ return hipErrorInvalidSymbol;
+ }
+
+ return it->second->getStatFuncAttr(func_attr, deviceId);
+}
+
+hipError_t StatCO::registerStatGlobalVar(const void* hostVar, Var* var) {
+ amd::ScopedLock lock(sclock_);
+
+ if (vars_.find(hostVar) != vars_.end()) {
+ return hipErrorInvalidSymbol;
+ }
+
+ vars_.insert(std::make_pair(hostVar, var));
+ return hipSuccess;
+}
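+// Illustrative callers (assumed from the hostSidePtr comments in
+// hip_code_object.hpp): symbol APIs such as hipGetSymbolAddress()/
+// hipMemcpyToSymbol() resolve a host-side symbol pointer through
+// getStatGlobalVar() below to obtain the per-device pointer and size.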
+hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
+ size_t* size_ptr) {
+ amd::ScopedLock lock(sclock_);
+
+ const auto it = vars_.find(hostVar);
+ if (it == vars_.end()) {
+ return hipErrorInvalidSymbol;
+ }
+
+ DeviceVar* dvar = nullptr;
+ IHIP_RETURN_ONFAIL(it->second->getStatDeviceVar(&dvar, deviceId));
+
+ *dev_ptr = dvar->device_ptr();
+ *size_ptr = dvar->size();
+ return hipSuccess;
+}
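+// Managed variables are registered below and initialized once per device:
+// initStatManagedVarDevicePtr() copies the host-side managed pointer into the
+// device symbol, roughly equivalent to this sketch:
+//   hipMemcpy(dvar->device_ptr(), var->getManagedVarPtr(), dvar->size(),
+//             hipMemcpyHostToDevice);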
+hipError_t StatCO::registerStatManagedVar(Var* var) {
+ managedVars_.emplace_back(var);
+ return hipSuccess;
+}
+
+hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) {
+ amd::ScopedLock lock(sclock_);
+
+ if (managedVarsDevicePtrInitalized_.find(deviceId) == managedVarsDevicePtrInitalized_.end() ||
+ !managedVarsDevicePtrInitalized_[deviceId]) {
+ for (auto var : managedVars_) {
+ DeviceVar* dvar = nullptr;
+ IHIP_RETURN_ONFAIL(var->getStatDeviceVar(&dvar, deviceId));
+
+ amd::HostQueue* queue = hip::getNullStream();
+ if (queue != nullptr) {
+ ihipMemcpy(reinterpret_cast<void*>(dvar->device_ptr()), var->getManagedVarPtr(),
+ dvar->size(), hipMemcpyHostToDevice, *queue);
+ } else {
+ ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL");
+ return hipErrorInvalidResourceHandle;
+ }
+ }
+ managedVarsDevicePtrInitalized_[deviceId] = true;
+ }
+ return hipSuccess;
+}
+}; //namespace: hip
diff --git a/rocclr/hip_code_object.hpp b/rocclr/hip_code_object.hpp
new file mode 100755
index 0000000000..6e406ad8da
--- /dev/null
+++ b/rocclr/hip_code_object.hpp
@@ -0,0 +1,156 @@
+/*
+Copyright (c) 2015-2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_CODE_OBJECT_HPP
+#define HIP_CODE_OBJECT_HPP
+
+#include "hip_global.hpp"
+
+#include <unordered_map>
+#include <vector>
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#include "hip_internal.hpp"
+#include "device/device.hpp"
+#include "platform/program.hpp"
+
+// Forward declaration for friend usage
+class PlatformState;
+
+namespace hip {
+
+// Code Object base class
+class CodeObject {
+ public:
+ virtual ~CodeObject() {}
+
+ // Functions to add_dev_prog and build
+ static hipError_t add_program(int deviceId, hipModule_t hmod, const void* binary_ptr,
+ size_t binary_size);
+ static hipError_t build_module(hipModule_t hmod, const std::vector<hip::Device*>& devices);
+
+ // Given a file desc and file size, extracts the code object for the corresponding devices;
+ // returns code_objs{binary_ptr, binary_size}, which could be used to determine foffset
+ static hipError_t ExtractCodeObjectFromFile(amd::Os::FileDesc fdesc, size_t fsize,
+ const void** image, const std::vector<std::string>& device_names,
+ std::vector<std::pair<const void*, size_t>>& code_objs);
+
+ // Given a ptr to memory, extracts the code object for the corresponding devices;
+ // returns code_objs{binary_ptr, binary_size} and the uniform resource indicator
+ static hipError_t ExtractCodeObjectFromMemory(const void* data,
+ const std::vector<std::string>& device_names,
+ std::vector<std::pair<const void*, size_t>>& code_objs,
+ std::string& uri);
+
+ static uint64_t ElfSize(const void* emi);
+
+protected:
+ // Given a ptr to an image or file, extracts the code object
+ // for the corresponding devices
+ static hipError_t extractCodeObjectFromFatBinary(const void*,
+ const std::vector<std::string>&,
+ std::vector<std::pair<const void*, size_t>>&);
+
+ CodeObject() {}
+private:
+ friend const std::vector& modules();
+};
+
+// Dynamic Code Object
+class DynCO : public CodeObject {
+ amd::Monitor dclock_{"Guards Dynamic Code object", true};
+
+public:
+ DynCO() : device_id_(ihipGetDevice()) {}
+ virtual ~DynCO();
+
+ // Loads a code object and its data
+ hipError_t loadCodeObject(const char* fname, const void* image = nullptr);
+ hipModule_t module() { return fb_info_->Module(ihipGetDevice()); };
+
+ // Gets GlobalVar/Functions from a dynamically loaded code object
+ hipError_t getDynFunc(hipFunction_t* hfunc, std::string func_name);
+ hipError_t getDeviceVar(DeviceVar** dvar, std::string var_name);
+
+ // Device ID check to ensure the module is launched on the same device it was loaded on.
+ inline void CheckDeviceIdMatch() {
+ if (device_id_ != ihipGetDevice()) {
+ guarantee(false, "Device mismatch from where this module is loaded");
+ }
+ }
+
+private:
+ int device_id_;
+ FatBinaryInfo* fb_info_;
+
+ // Maps for vars/funcs, keyed by their std::string name
+ std::unordered_map<std::string, Function*> functions_;
+ std::unordered_map<std::string, Var*> vars_;
+
+ // Populate global vars/funcs from a code object (at module load)
+ hipError_t populateDynGlobalFuncs();
+ hipError_t populateDynGlobalVars();
+};
+
+// Static Code Object
+class StatCO: public CodeObject {
+ amd::Monitor sclock_{"Guards Static Code object", true};
+public:
+ StatCO();
+ virtual ~StatCO();
+
+ // Add/Remove/Digest fat binaries passed to us from "__hipRegisterFatBinary"
+ FatBinaryInfo** addFatBinary(const void* data, bool initialized);
+ hipError_t removeFatBinary(FatBinaryInfo** module);
+ hipError_t digestFatBinary(const void* data, FatBinaryInfo*& programs);
+
+ // Register vars/funcs given to us from __hipRegister[Var/Func/ManagedVar]
+ hipError_t registerStatFunction(const void* hostFunction, Function* func);
+ hipError_t registerStatGlobalVar(const void* hostVar, Var* var);
+ hipError_t registerStatManagedVar(Var* var);
+
+ // Retrieve vars/funcs for a given hostSidePtr(const void*), unless stated otherwise.
+ hipError_t getStatFunc(hipFunction_t* hfunc, const void* hostFunction, int deviceId);
+ hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
+ hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
+ size_t* size_ptr);
+
+ // A managed variable is a defined symbol in the code object; the pointer to the
+ // allocated managed memory has to be copied to the address of the symbol
+ hipError_t initStatManagedVarDevicePtr(int deviceId);
+private:
+ friend class ::PlatformState;
+ // Populated during __hipRegisterFatBinary
+ std::unordered_map<const void*, FatBinaryInfo*> modules_;
+ // Populated during __hipRegisterFuncs
+ std::unordered_map<const void*, Function*> functions_;
+ // Populated during __hipRegisterVars
+ std::unordered_map<const void*, Var*> vars_;
+ // Populated during __hipRegisterManagedVar
+ std::vector<Var*> managedVars_;
+ std::unordered_map<int, bool> managedVarsDevicePtrInitalized_;
+};
+
+}; // namespace hip
+
+#endif /* HIP_CODE_OBJECT_HPP */
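+// Illustrative flow for the static path (a sketch; the mapping is inferred from
+// the member comments above): a hipcc-built binary registers its embedded fat
+// binary at load time, roughly
+//   __hipRegisterFatBinary(data)     -> StatCO::addFatBinary()
+//   __hipRegisterFunction(stub, ...) -> StatCO::registerStatFunction()
+//   __hipRegisterVar(ptr, ...)       -> StatCO::registerStatGlobalVar()
+//   __hipRegisterManagedVar(...)     -> StatCO::registerStatManagedVar()
+// after which kernel launches and symbol APIs resolve through getStatFunc()
+// and getStatGlobalVar().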
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include <hip/hip_runtime.h>
+#include "hip_internal.hpp"
+#include "hip_platform.hpp"
+#include "platform/runtime.hpp"
+#include "utils/flags.hpp"
+#include "utils/versions.hpp"
+
+std::vector<hip::Device*> g_devices;
+
+namespace hip {
+
+thread_local Device* g_device = nullptr;
+thread_local std::stack<Device*> g_ctxtStack;
+thread_local hipError_t g_lastError = hipSuccess;
+std::once_flag g_ihipInitialized;
+Device* host_device = nullptr;
+
+void init() {
+  if (!amd::Runtime::initialized()) {
+    amd::IS_HIP = true;
+    GPU_NUM_MEM_DEPENDENCY = 0;
+    AMD_DIRECT_DISPATCH = flagIsDefault(AMD_DIRECT_DISPATCH) ? false : AMD_DIRECT_DISPATCH;
+    amd::Runtime::init();
+  }
+
+  const std::vector<amd::Device*>& devices = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false);
+
+  for (unsigned int i=0; i < devices.size(); i++) {
+    const std::vector<amd::Device*> device(1, devices[i]);
+    amd::Context* context = new amd::Context(device, amd::Context::Info());
+    if (!context) return;
+
+    // Enable active wait on the device by default
+    devices[i]->SetActiveWait(true);
+
+    if (context && CL_SUCCESS != context->create(nullptr)) {
+      context->release();
+    } else {
+      g_devices.push_back(new Device(context, i));
+    }
+  }
+
+  amd::Context* hContext = new amd::Context(devices, amd::Context::Info());
+  if (!hContext) return;
+
+  if (CL_SUCCESS != hContext->create(nullptr)) {
+    hContext->release();
+  }
+  host_device = new Device(hContext, -1);
+
+  PlatformState::instance().init();
+}
+
+Device* getCurrentDevice() {
+  return g_device;
+}
+
+void setCurrentDevice(unsigned int index) {
+  assert(index < g_devices.size());
+  g_device = g_devices[index];
+}
+
+amd::HostQueue* getQueue(hipStream_t stream) {
+  if (stream == nullptr) {
+    return getNullStream();
+  } else {
+    constexpr bool WaitNullStreamOnly = true;
+    amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(stream)->asHostQueue();
+    if (!(reinterpret_cast<hip::Stream*>(stream)->Flags() & hipStreamNonBlocking)) {
+      iHipWaitActiveStreams(queue, WaitNullStreamOnly);
+    }
+    return queue;
+  }
+}
+
+// ================================================================================================
+amd::HostQueue* getNullStream(amd::Context& ctx) {
+  for (auto& it : g_devices) {
+    if (it->asContext() == &ctx) {
+      return it->NullStream();
+    }
+  }
+  // If it's a pure SVM allocation with system memory access, then it shouldn't matter which device
+  // runtime selects by default
+  if (hip::host_device->asContext() == &ctx) {
+    // Return current...
+    return getNullStream();
+  }
+  return nullptr;
+}
+
+// ================================================================================================
+amd::HostQueue* getNullStream() {
+  Device* device = getCurrentDevice();
+  return device ?
device->NullStream() : nullptr; +} + +}; + +using namespace hip; + +hipError_t hipInit(unsigned int flags) { + HIP_INIT_API(hipInit, flags); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device) { + HIP_INIT_API(hipCtxCreate, ctx, flags, device); + + if (static_cast(device) >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidValue); + } + + *ctx = reinterpret_cast(g_devices[device]); + + // Increment ref count for device primary context + g_devices[device]->retain(); + g_ctxtStack.push(g_devices[device]); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipCtxSetCurrent(hipCtx_t ctx) { + HIP_INIT_API(hipCtxSetCurrent, ctx); + + if (ctx == nullptr) { + if(!g_ctxtStack.empty()) { + g_ctxtStack.pop(); + } + } else { + hip::g_device = reinterpret_cast(ctx); + if(!g_ctxtStack.empty()) { + g_ctxtStack.pop(); + } + g_ctxtStack.push(hip::getCurrentDevice()); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipCtxGetCurrent(hipCtx_t* ctx) { + HIP_INIT_API(hipCtxGetCurrent, ctx); + + *ctx = reinterpret_cast(hip::getCurrentDevice()); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig) { + HIP_INIT_API(hipCtxGetSharedMemConfig, pConfig); + + *pConfig = hipSharedMemBankSizeFourByte; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipRuntimeGetVersion(int *runtimeVersion) { + HIP_INIT_API(hipRuntimeGetVersion, runtimeVersion); + + if (!runtimeVersion) { + HIP_RETURN(hipErrorInvalidValue); + } + + // HIP_VERSION = HIP_VERSION_MAJOR*100 + HIP_MINOR_VERSION + *runtimeVersion = HIP_VERSION; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipCtxDestroy(hipCtx_t ctx) { + HIP_INIT_API(hipCtxDestroy, ctx); + + hip::Device* dev = reinterpret_cast(ctx); + if (dev == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + // Need to remove the ctx of calling thread if its the top one + if (!g_ctxtStack.empty() && g_ctxtStack.top() == dev) { + g_ctxtStack.pop(); + } + + // Remove context from global context list + for (unsigned int i = 0; i < g_devices.size(); i++) { + if (g_devices[i] == dev) { + // Decrement ref count for device primary context + dev->release(); + } + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipCtxPopCurrent(hipCtx_t* ctx) { + HIP_INIT_API(hipCtxPopCurrent, ctx); + + hip::Device** dev = reinterpret_cast(ctx); + if (!g_ctxtStack.empty()) { + if (dev != nullptr) { + *dev = g_ctxtStack.top(); + } + g_ctxtStack.pop(); + } else { + DevLogError("Context Stack empty \n"); + HIP_RETURN(hipErrorInvalidContext); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipCtxPushCurrent(hipCtx_t ctx) { + HIP_INIT_API(hipCtxPushCurrent, ctx); + + hip::Device* dev = reinterpret_cast(ctx); + if (dev == nullptr) { + HIP_RETURN(hipErrorInvalidContext); + } + + hip::g_device = dev; + g_ctxtStack.push(hip::getCurrentDevice()); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDriverGetVersion(int* driverVersion) { + HIP_INIT_API(hipDriverGetVersion, driverVersion); + + auto* deviceHandle = g_devices[0]->devices()[0]; + const auto& info = deviceHandle->info(); + + if (driverVersion) { + *driverVersion = AMD_PLATFORM_BUILD_NUMBER * 100 + + AMD_PLATFORM_REVISION_NUMBER; + } else { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipCtxGetDevice(hipDevice_t* device) { + HIP_INIT_API(hipCtxGetDevice, device); + + if (device != nullptr) { + *device = hip::getCurrentDevice()->deviceId(); + HIP_RETURN(hipSuccess); + } else { + HIP_RETURN(hipErrorInvalidValue); + } + + 
HIP_RETURN(hipErrorInvalidContext); +} + +hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion) { + HIP_INIT_API(hipCtxGetApiVersion, apiVersion); + + assert(0 && "Unimplemented"); + + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t hipCtxGetCacheConfig(hipFuncCache_t* cacheConfig) { + HIP_INIT_API(hipCtxGetCacheConfig, cacheConfig); + + assert(0 && "Unimplemented"); + + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t hipCtxSetCacheConfig(hipFuncCache_t cacheConfig) { + HIP_INIT_API(hipCtxSetCacheConfig, cacheConfig); + + assert(0 && "Unimplemented"); + + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config) { + HIP_INIT_API(hipCtxSetSharedMemConfig, config); + + assert(0 && "Unimplemented"); + + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t hipCtxSynchronize(void) { + HIP_INIT_API(hipCtxSynchronize, 1); + + assert(0 && "Unimplemented"); + + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t hipCtxGetFlags(unsigned int* flags) { + HIP_INIT_API(hipCtxGetFlags, flags); + + assert(0 && "Unimplemented"); + + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active) { + HIP_INIT_API(hipDevicePrimaryCtxGetState, dev, flags, active); + + if (static_cast(dev) >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidDevice); + } + + if (flags != nullptr) { + *flags = 0; + } + + if (active != nullptr) { + *active = (g_devices[dev] == hip::getCurrentDevice())? 1 : 0; + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev) { + HIP_INIT_API(hipDevicePrimaryCtxRelease, dev); + + if (static_cast(dev) >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidDevice); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev) { + HIP_INIT_API(hipDevicePrimaryCtxRetain, pctx, dev); + + if (static_cast(dev) >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidDevice); + } + if (pctx == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + *pctx = reinterpret_cast(g_devices[dev]); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev) { + HIP_INIT_API(hipDevicePrimaryCtxReset, dev); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags) { + HIP_INIT_API(hipDevicePrimaryCtxSetFlags, dev, flags); + + if (static_cast(dev) >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidDevice); + } else { + HIP_RETURN(hipErrorContextAlreadyInUse); + } +} diff --git a/rocclr/hip_conversions.hpp b/rocclr/hip_conversions.hpp new file mode 100644 index 0000000000..35948703ad --- /dev/null +++ b/rocclr/hip_conversions.hpp @@ -0,0 +1,903 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
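A hedged usage sketch of the driver-style context APIs in hip_context.cpp above. On this runtime a hipCtx_t aliases a device's primary context and hipCtxCreate()/hipCtxPopCurrent() manipulate the per-thread context stack; device 0 is assumed to exist.

    #include <hip/hip_runtime.h>

    int main() {
      hipCtx_t ctx = nullptr;
      if (hipCtxCreate(&ctx, 0, 0) != hipSuccess) return 1;  // retains device 0's primary context
      hipDevice_t dev = -1;
      hipCtxGetDevice(&dev);        // reports the device backing the current context
      hipCtx_t popped = nullptr;
      hipCtxPopCurrent(&popped);    // pops the per-thread context stack
      hipCtxDestroy(ctx);           // drops the primary-context reference
      return 0;
    }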
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include +#include + +namespace hip +{ +inline +cl_channel_type getCLChannelType(const hipArray_Format hipFormat, + const hipTextureReadMode hipReadMode) { + if (hipReadMode == hipReadModeElementType) { + switch (hipFormat) { + case HIP_AD_FORMAT_UNSIGNED_INT8: + return CL_UNSIGNED_INT8; + case HIP_AD_FORMAT_SIGNED_INT8: + return CL_SIGNED_INT8; + case HIP_AD_FORMAT_UNSIGNED_INT16: + return CL_UNSIGNED_INT16; + case HIP_AD_FORMAT_SIGNED_INT16: + return CL_SIGNED_INT16; + case HIP_AD_FORMAT_UNSIGNED_INT32: + return CL_UNSIGNED_INT32; + case HIP_AD_FORMAT_SIGNED_INT32: + return CL_SIGNED_INT32; + case HIP_AD_FORMAT_HALF: + return CL_HALF_FLOAT; + case HIP_AD_FORMAT_FLOAT: + return CL_FLOAT; + } + } else if (hipReadMode == hipReadModeNormalizedFloat) { + switch (hipFormat) { + case HIP_AD_FORMAT_UNSIGNED_INT8: + return CL_UNORM_INT8; + case HIP_AD_FORMAT_SIGNED_INT8: + return CL_SNORM_INT8; + case HIP_AD_FORMAT_UNSIGNED_INT16: + return CL_UNORM_INT16; + case HIP_AD_FORMAT_SIGNED_INT16: + return CL_SNORM_INT16; + case HIP_AD_FORMAT_UNSIGNED_INT32: + return CL_UNSIGNED_INT32; + case HIP_AD_FORMAT_SIGNED_INT32: + return CL_SIGNED_INT32; + case HIP_AD_FORMAT_HALF: + return CL_HALF_FLOAT; + case HIP_AD_FORMAT_FLOAT: + return CL_FLOAT; + } + } + + ShouldNotReachHere(); + + return {}; +} + +inline +cl_channel_order getCLChannelOrder(const unsigned int hipNumChannels, + const int sRGB) { + switch (hipNumChannels) { + case 1: + return CL_R; + case 2: + return CL_RG; + case 4: + return (sRGB == 1) ? 
CL_sRGBA : CL_RGBA; + default: + break; + } + + ShouldNotReachHere(); + + return {}; +} + +inline +cl_mem_object_type getCLMemObjectType(const unsigned int hipWidth, + const unsigned int hipHeight, + const unsigned int hipDepth, + const unsigned int flags) { + if (flags == hipArrayDefault) { + if ((hipWidth != 0) && (hipHeight == 0) && (hipDepth == 0)) { + return CL_MEM_OBJECT_IMAGE1D; + } else if ((hipWidth != 0) && (hipHeight != 0) && (hipDepth == 0)) { + return CL_MEM_OBJECT_IMAGE2D; + } else if ((hipWidth != 0) && (hipHeight != 0) && (hipDepth != 0)) { + return CL_MEM_OBJECT_IMAGE3D; + } + } else if (flags == hipArrayLayered) { + if ((hipWidth != 0) && (hipHeight == 0) && (hipDepth != 0)) { + return CL_MEM_OBJECT_IMAGE1D_ARRAY; + } else if ((hipWidth != 0) && (hipHeight != 0) && (hipDepth != 0)) { + return CL_MEM_OBJECT_IMAGE2D_ARRAY; + } + } + + ShouldNotReachHere(); + + return {}; +} + +inline +cl_addressing_mode getCLAddressingMode(const hipTextureAddressMode hipAddressMode) { + switch (hipAddressMode) { + case hipAddressModeWrap: + return CL_ADDRESS_REPEAT; + case hipAddressModeClamp: + return CL_ADDRESS_CLAMP_TO_EDGE; + case hipAddressModeMirror: + return CL_ADDRESS_MIRRORED_REPEAT; + case hipAddressModeBorder: + return CL_ADDRESS_CLAMP; + } + + ShouldNotReachHere(); + + return {}; +} + +inline +cl_filter_mode getCLFilterMode(const hipTextureFilterMode hipFilterMode) { + switch (hipFilterMode) { + case hipFilterModePoint: + return CL_FILTER_NEAREST; + case hipFilterModeLinear: + return CL_FILTER_LINEAR; + } + + ShouldNotReachHere(); + + return {}; +} + +inline +cl_mem_object_type getCLMemObjectType(const hipResourceType hipResType) { + switch (hipResType) { + case hipResourceTypeLinear: + return CL_MEM_OBJECT_IMAGE1D_BUFFER; + case hipResourceTypePitch2D: + return CL_MEM_OBJECT_IMAGE2D; + default: + break; + } + + ShouldNotReachHere(); + + return {}; +} + +inline +size_t getElementSize(const hipArray_const_t array) { + switch (array->Format) { + case HIP_AD_FORMAT_UNSIGNED_INT8: + case HIP_AD_FORMAT_SIGNED_INT8: + return 1 * array->NumChannels; + case HIP_AD_FORMAT_UNSIGNED_INT16: + case HIP_AD_FORMAT_SIGNED_INT16: + case HIP_AD_FORMAT_HALF: + return 2 * array->NumChannels; + case HIP_AD_FORMAT_UNSIGNED_INT32: + case HIP_AD_FORMAT_SIGNED_INT32: + case HIP_AD_FORMAT_FLOAT: + return 4 * array->NumChannels; + } + + ShouldNotReachHere(); + + return {}; +} + +inline +hipChannelFormatDesc getChannelFormatDesc(int numChannels, + hipArray_Format arrayFormat) { + switch (arrayFormat) { + case HIP_AD_FORMAT_UNSIGNED_INT8: + switch (numChannels) { + case 1: + return {8, 0, 0, 0, hipChannelFormatKindUnsigned}; + case 2: + return {8, 8, 0, 0, hipChannelFormatKindUnsigned}; + case 4: + return {8, 8, 8, 8, hipChannelFormatKindUnsigned}; + } + case HIP_AD_FORMAT_SIGNED_INT8: + switch (numChannels) { + case 1: + return {8, 0, 0, 0, hipChannelFormatKindSigned}; + case 2: + return {8, 8, 0, 0, hipChannelFormatKindSigned}; + case 4: + return {8, 8, 8, 8, hipChannelFormatKindSigned}; + } + case HIP_AD_FORMAT_UNSIGNED_INT16: + switch (numChannels) { + case 1: + return {16, 0, 0, 0, hipChannelFormatKindUnsigned}; + case 2: + return {16, 16, 0, 0, hipChannelFormatKindUnsigned}; + case 4: + return {16, 16, 16, 16, hipChannelFormatKindUnsigned}; + } + case HIP_AD_FORMAT_SIGNED_INT16: + switch (numChannels) { + case 1: + return {16, 0, 0, 0, hipChannelFormatKindSigned}; + case 2: + return {16, 16, 0, 0, hipChannelFormatKindSigned}; + case 4: + return {16, 16, 16, 16, hipChannelFormatKindSigned}; + } + case 
HIP_AD_FORMAT_UNSIGNED_INT32: + switch (numChannels) { + case 1: + return {32, 0, 0, 0, hipChannelFormatKindUnsigned}; + case 2: + return {32, 32, 0, 0, hipChannelFormatKindUnsigned}; + case 4: + return {32, 32, 32, 32, hipChannelFormatKindUnsigned}; + } + case HIP_AD_FORMAT_SIGNED_INT32: + switch (numChannels) { + case 1: + return {32, 0, 0, 0, hipChannelFormatKindSigned}; + case 2: + return {32, 32, 0, 0, hipChannelFormatKindSigned}; + case 4: + return {32, 32, 32, 32, hipChannelFormatKindSigned}; + } + case HIP_AD_FORMAT_HALF: + switch (numChannels) { + case 1: + return {16, 0, 0, 0, hipChannelFormatKindFloat}; + case 2: + return {16, 16, 0, 0, hipChannelFormatKindFloat}; + case 4: + return {16, 16, 16, 16, hipChannelFormatKindFloat}; + } + case HIP_AD_FORMAT_FLOAT: + switch (numChannels) { + case 1: + return {32, 0, 0, 0, hipChannelFormatKindFloat}; + case 2: + return {32, 32, 0, 0, hipChannelFormatKindFloat}; + case 4: + return {32, 32, 32, 32, hipChannelFormatKindFloat}; + } + } + + ShouldNotReachHere(); + + return {}; +} + +inline +unsigned int getNumChannels(const hipChannelFormatDesc& desc) { + return ((desc.x != 0) + (desc.y != 0) + (desc.z != 0) + (desc.w != 0)); +} + +inline +hipArray_Format getArrayFormat(const hipChannelFormatDesc& desc) { + switch (desc.f) { + case hipChannelFormatKindUnsigned: + switch (desc.x) { + case 8: + return HIP_AD_FORMAT_UNSIGNED_INT8; + case 16: + return HIP_AD_FORMAT_UNSIGNED_INT16; + case 32: + return HIP_AD_FORMAT_UNSIGNED_INT32; + } + case hipChannelFormatKindSigned: + switch (desc.x) { + case 8: + return HIP_AD_FORMAT_SIGNED_INT8; + case 16: + return HIP_AD_FORMAT_SIGNED_INT16; + case 32: + return HIP_AD_FORMAT_SIGNED_INT32; + } + case hipChannelFormatKindFloat: + switch (desc.x) { + case 16: + return HIP_AD_FORMAT_HALF; + case 32: + return HIP_AD_FORMAT_FLOAT; + } + default: + break; + } + + ShouldNotReachHere(); + + return {}; +} + +inline +int getNumChannels(const hipResourceViewFormat hipFormat) { + switch (hipFormat) { + case hipResViewFormatUnsignedChar1: + case hipResViewFormatSignedChar1: + case hipResViewFormatUnsignedShort1: + case hipResViewFormatSignedShort1: + case hipResViewFormatUnsignedInt1: + case hipResViewFormatSignedInt1: + case hipResViewFormatHalf1: + case hipResViewFormatFloat1: + return 1; + case hipResViewFormatUnsignedChar2: + case hipResViewFormatSignedChar2: + case hipResViewFormatUnsignedShort2: + case hipResViewFormatSignedShort2: + case hipResViewFormatUnsignedInt2: + case hipResViewFormatSignedInt2: + case hipResViewFormatHalf2: + case hipResViewFormatFloat2: + return 2; + case hipResViewFormatUnsignedChar4: + case hipResViewFormatSignedChar4: + case hipResViewFormatUnsignedShort4: + case hipResViewFormatSignedShort4: + case hipResViewFormatUnsignedInt4: + case hipResViewFormatSignedInt4: + case hipResViewFormatHalf4: + case hipResViewFormatFloat4: + return 4; + default: + break; + } + + ShouldNotReachHere(); + + return {}; +} + +inline +hipArray_Format getArrayFormat(const hipResourceViewFormat hipFormat) { + switch (hipFormat) { + case hipResViewFormatUnsignedChar1: + case hipResViewFormatUnsignedChar2: + case hipResViewFormatUnsignedChar4: + return HIP_AD_FORMAT_UNSIGNED_INT8; + case hipResViewFormatSignedChar1: + case hipResViewFormatSignedChar2: + case hipResViewFormatSignedChar4: + return HIP_AD_FORMAT_SIGNED_INT8; + case hipResViewFormatUnsignedShort1: + case hipResViewFormatUnsignedShort2: + case hipResViewFormatUnsignedShort4: + return HIP_AD_FORMAT_UNSIGNED_INT16; + case hipResViewFormatSignedShort1: + 
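A host-only sketch of the channel-descriptor helpers around here (getNumChannels, getArrayFormat, getElementSize). It assumes the internal header hip_conversions.hpp is reachable on the include path, which is an assumption about the build setup, not something this patch provides.

    #include "hip_conversions.hpp"  // assumed reachable; internal header
    #include <cassert>

    int main() {
      // A 4-channel 32-bit float descriptor, as hipCreateChannelDesc<float4>() would produce.
      const hipChannelFormatDesc desc =
          hipCreateChannelDesc(32, 32, 32, 32, hipChannelFormatKindFloat);
      assert(hip::getNumChannels(desc) == 4);
      assert(hip::getArrayFormat(desc) == HIP_AD_FORMAT_FLOAT);
      assert(hip::getElementSize(desc) == 16);  // (32/8) bytes per channel * 4 channels
      return 0;
    }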
case hipResViewFormatSignedShort2: + case hipResViewFormatSignedShort4: + return HIP_AD_FORMAT_SIGNED_INT16; + case hipResViewFormatUnsignedInt1: + case hipResViewFormatUnsignedInt2: + case hipResViewFormatUnsignedInt4: + return HIP_AD_FORMAT_UNSIGNED_INT32; + case hipResViewFormatSignedInt1: + case hipResViewFormatSignedInt2: + case hipResViewFormatSignedInt4: + return HIP_AD_FORMAT_SIGNED_INT32; + case hipResViewFormatHalf1: + case hipResViewFormatHalf2: + case hipResViewFormatHalf4: + return HIP_AD_FORMAT_HALF; + case hipResViewFormatFloat1: + case hipResViewFormatFloat2: + case hipResViewFormatFloat4: + return HIP_AD_FORMAT_FLOAT; + default: + break; + } + + ShouldNotReachHere(); + + return {}; +} + +inline +hipResourceViewFormat getResourceViewFormat(const hipChannelFormatDesc& desc) { + switch (desc.f) { + case hipChannelFormatKindUnsigned: + switch (getNumChannels(desc)) { + case 1: + switch (desc.x) { + case 8: + return hipResViewFormatUnsignedChar1; + case 16: + return hipResViewFormatUnsignedShort1; + case 32: + return hipResViewFormatUnsignedInt1; + } + case 2: + switch (desc.x) { + case 8: + return hipResViewFormatUnsignedChar2; + case 16: + return hipResViewFormatUnsignedShort2; + case 32: + return hipResViewFormatUnsignedInt2; + } + case 4: + switch (desc.x) { + case 8: + return hipResViewFormatUnsignedChar4; + case 16: + return hipResViewFormatUnsignedShort4; + case 32: + return hipResViewFormatUnsignedInt4; + } + } + case hipChannelFormatKindSigned: + switch (getNumChannels(desc)) { + case 1: + switch (desc.x) { + case 8: + return hipResViewFormatSignedChar1; + case 16: + return hipResViewFormatSignedShort1; + case 32: + return hipResViewFormatSignedInt1; + } + case 2: + switch (desc.x) { + case 8: + return hipResViewFormatSignedChar2; + case 16: + return hipResViewFormatSignedShort2; + case 32: + return hipResViewFormatSignedInt2; + } + case 4: + switch (desc.x) { + case 8: + return hipResViewFormatSignedChar4; + case 16: + return hipResViewFormatSignedShort4; + case 32: + return hipResViewFormatSignedInt4; + } + } + case hipChannelFormatKindFloat: + switch (getNumChannels(desc)) { + case 1: + switch (desc.x) { + case 16: + return hipResViewFormatHalf1; + case 32: + return hipResViewFormatFloat1; + } + case 2: + switch (desc.x) { + case 16: + return hipResViewFormatHalf2; + case 32: + return hipResViewFormatFloat2; + } + case 4: + switch (desc.x) { + case 16: + return hipResViewFormatHalf4; + case 32: + return hipResViewFormatFloat4; + } + } + default: + break; + } + + ShouldNotReachHere(); + + return {}; +} + +inline +hipTextureDesc getTextureDesc(const textureReference* texRef) { + hipTextureDesc texDesc = {}; + std::memcpy(texDesc.addressMode, texRef->addressMode, sizeof(texDesc.addressMode)); + texDesc.filterMode = texRef->filterMode; + texDesc.readMode = texRef->readMode; + texDesc.sRGB = texRef->sRGB; + texDesc.normalizedCoords = texRef->normalized; + texDesc.maxAnisotropy = texRef->maxAnisotropy; + texDesc.mipmapFilterMode = texRef->mipmapFilterMode; + texDesc.mipmapLevelBias = texRef->mipmapLevelBias; + texDesc.minMipmapLevelClamp = texRef->minMipmapLevelClamp; + texDesc.maxMipmapLevelClamp = texRef->maxMipmapLevelClamp; + + return texDesc; +} + +inline +hipResourceViewDesc getResourceViewDesc(hipArray_const_t array, + const hipResourceViewFormat format) { + hipResourceViewDesc resViewDesc = {}; + resViewDesc.format = format; + resViewDesc.width = array->width; + resViewDesc.height = array->height; + resViewDesc.depth = array->depth; + resViewDesc.firstMipmapLevel 
= 0;
+  resViewDesc.lastMipmapLevel = 0;
+  resViewDesc.firstLayer = 0;
+  resViewDesc.lastLayer = 0; /* TODO add hipArray::numLayers */
+
+  return resViewDesc;
+}
+
+inline
+hipResourceViewDesc getResourceViewDesc(hipMipmappedArray_const_t array,
+                                        const hipResourceViewFormat format) {
+  hipResourceViewDesc resViewDesc = {};
+  resViewDesc.format = format;
+  resViewDesc.width = array->width;
+  resViewDesc.height = array->height;
+  resViewDesc.depth = array->depth;
+  resViewDesc.firstMipmapLevel = 0;
+  resViewDesc.lastMipmapLevel = 0; /* TODO add hipMipmappedArray::numMipLevels */
+  resViewDesc.firstLayer = 0;
+  resViewDesc.lastLayer = 0; /* TODO add hipArray::numLayers */
+
+  return resViewDesc;
+}
+
+inline
+std::pair<hipMemoryType, hipMemoryType> getMemoryType(const hipMemcpyKind kind) {
+  switch (kind) {
+  case hipMemcpyHostToHost:
+    return {hipMemoryTypeHost, hipMemoryTypeHost};
+  case hipMemcpyHostToDevice:
+    return {hipMemoryTypeHost, hipMemoryTypeDevice};
+  case hipMemcpyDeviceToHost:
+    return {hipMemoryTypeDevice, hipMemoryTypeHost};
+  case hipMemcpyDeviceToDevice:
+    return {hipMemoryTypeDevice, hipMemoryTypeDevice};
+  case hipMemcpyDefault:
+    return {hipMemoryTypeUnified, hipMemoryTypeUnified};
+  }
+
+  ShouldNotReachHere();
+
+  return {};
+}
+
+inline
+HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
+  HIP_MEMCPY3D desc3D = {};
+
+  desc3D.srcXInBytes = desc2D.srcXInBytes;
+  desc3D.srcY = desc2D.srcY;
+  desc3D.srcZ = 0;
+  desc3D.srcLOD = 0;
+  desc3D.srcMemoryType = desc2D.srcMemoryType;
+  desc3D.srcHost = desc2D.srcHost;
+  desc3D.srcDevice = desc2D.srcDevice;
+  desc3D.srcArray = desc2D.srcArray;
+  desc3D.srcPitch = desc2D.srcPitch;
+  desc3D.srcHeight = 0;
+
+  desc3D.dstXInBytes = desc2D.dstXInBytes;
+  desc3D.dstY = desc2D.dstY;
+  desc3D.dstZ = 0;
+  desc3D.dstLOD = 0;
+  desc3D.dstMemoryType = desc2D.dstMemoryType;
+  desc3D.dstHost = desc2D.dstHost;
+  desc3D.dstDevice = desc2D.dstDevice;
+  desc3D.dstArray = desc2D.dstArray;
+  desc3D.dstPitch = desc2D.dstPitch;
+  desc3D.dstHeight = 0;
+
+  desc3D.WidthInBytes = desc2D.WidthInBytes;
+  desc3D.Height = desc2D.Height;
+  desc3D.Depth = 1;
+
+  return desc3D;
+}
+
+inline
+HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
+  HIP_MEMCPY3D descDrv = {};
+
+  descDrv.WidthInBytes = desc.extent.width;
+  descDrv.Height = desc.extent.height;
+  descDrv.Depth = desc.extent.depth;
+
+  descDrv.srcXInBytes = desc.srcPos.x;
+  descDrv.srcY = desc.srcPos.y;
+  descDrv.srcZ = desc.srcPos.z;
+  descDrv.srcLOD = 0;
+
+  descDrv.dstXInBytes = desc.dstPos.x;
+  descDrv.dstY = desc.dstPos.y;
+  descDrv.dstZ = desc.dstPos.z;
+  descDrv.dstLOD = 0;
+
+  if (desc.srcArray != nullptr) {
+    descDrv.srcMemoryType = hipMemoryTypeArray;
+    descDrv.srcArray = desc.srcArray;
+    // When referring to array memory, hipPos::x is in elements.
+    descDrv.srcXInBytes *= getElementSize(desc.srcArray);
+  }
+
+  if (desc.srcPtr.ptr != nullptr) {
+    descDrv.srcMemoryType = std::get<0>(hip::getMemoryType(desc.kind));
+    descDrv.srcHost = desc.srcPtr.ptr;
+    descDrv.srcDevice = desc.srcPtr.ptr;
+    descDrv.srcPitch = desc.srcPtr.pitch;
+    descDrv.srcHeight = desc.srcPtr.ysize;
+  }
+
+  if (desc.dstArray != nullptr) {
+    descDrv.dstMemoryType = hipMemoryTypeArray;
+    descDrv.dstArray = desc.dstArray;
+    // When referring to array memory, hipPos::x is in elements.
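A hedged sketch of the element-versus-byte scaling that getDrvMemcpy3DDesc() performs here, seen through the public hipMemcpy3D() API: when a HIP array participates, positions and extents are given in elements and the conversion multiplies them by the element size (16 bytes for float4 below). Sizes are illustrative.

    #include <hip/hip_runtime.h>

    int main() {
      // 16x16 array of float4 texels.
      hipChannelFormatDesc ch = hipCreateChannelDesc(32, 32, 32, 32, hipChannelFormatKindFloat);
      hipArray_t array;
      hipMalloc3DArray(&array, &ch, make_hipExtent(16, 16, 0), hipArrayDefault);

      float4 host[16 * 16] = {};
      hipMemcpy3DParms p = {};
      p.srcPtr = make_hipPitchedPtr(host, 16 * sizeof(float4), 16, 16);
      p.dstArray = array;
      p.extent = make_hipExtent(16, 16, 1);  // width in elements; scaled to bytes internally
      p.kind = hipMemcpyHostToDevice;
      hipMemcpy3D(&p);

      hipFreeArray(array);
      return 0;
    }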
+ descDrv.dstXInBytes *= getElementSize(desc.dstArray); + } + + if (desc.dstPtr.ptr != nullptr) { + descDrv.dstMemoryType = std::get<1>(getMemoryType(desc.kind)); + descDrv.dstHost = desc.dstPtr.ptr; + descDrv.dstDevice = desc.dstPtr.ptr; + descDrv.dstPitch = desc.dstPtr.pitch; + descDrv.dstHeight = desc.dstPtr.ysize; + } + + // If a HIP array is participating in the copy, the extent is defined in terms of that array's elements. + if ((desc.srcArray != nullptr) && (desc.dstArray == nullptr)) { + descDrv.WidthInBytes *= getElementSize(desc.srcArray); + } else if ((desc.srcArray == nullptr) && (desc.dstArray != nullptr)) { + descDrv.WidthInBytes *= getElementSize(desc.dstArray); + } else if ((desc.srcArray != nullptr) && (desc.dstArray != nullptr)) { + descDrv.WidthInBytes *= getElementSize(desc.dstArray); + } + + return descDrv; +} + +inline +hipResourceType getResourceType(const HIPresourcetype resType) { + // These two enums should be isomorphic. + return static_cast(resType); +} + +inline +HIPresourcetype getResourceType(const hipResourceType resType) { + // These two enums should be isomorphic. + return static_cast(resType); +} + +inline +hipResourceDesc getResourceDesc(const HIP_RESOURCE_DESC& resDesc) { + hipResourceDesc desc; + + desc.resType = getResourceType(resDesc.resType); + switch (desc.resType) { + case hipResourceTypeArray: + desc.res.array.array = resDesc.res.array.hArray; + break; + case hipResourceTypeMipmappedArray: + desc.res.mipmap.mipmap = resDesc.res.mipmap.hMipmappedArray; + break; + case hipResourceTypeLinear: + desc.res.linear.devPtr = resDesc.res.linear.devPtr; + desc.res.linear.desc = getChannelFormatDesc(resDesc.res.linear.numChannels, resDesc.res.linear.format); + desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes; + break; + case hipResourceTypePitch2D: + desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr; + desc.res.pitch2D.desc = getChannelFormatDesc(resDesc.res.pitch2D.numChannels, resDesc.res.pitch2D.format); + desc.res.pitch2D.width = resDesc.res.pitch2D.width; + desc.res.pitch2D.height = resDesc.res.pitch2D.height; + desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes; + break; + default: + break; + } + + return desc; +} + +inline +HIP_RESOURCE_DESC getResourceDesc(const hipResourceDesc& resDesc) { + HIP_RESOURCE_DESC desc; + + desc.resType = getResourceType(resDesc.resType); + switch (desc.resType) { + case HIP_RESOURCE_TYPE_ARRAY: + desc.res.array.hArray = resDesc.res.array.array; + break; + case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY: + desc.res.mipmap.hMipmappedArray = resDesc.res.mipmap.mipmap; + break; + case HIP_RESOURCE_TYPE_LINEAR: + desc.res.linear.devPtr = resDesc.res.linear.devPtr; + desc.res.linear.numChannels = getNumChannels(resDesc.res.linear.desc); + desc.res.linear.format = getArrayFormat(resDesc.res.linear.desc); + desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes; + break; + case HIP_RESOURCE_TYPE_PITCH2D: + desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr; + desc.res.pitch2D.numChannels = getNumChannels(resDesc.res.pitch2D.desc); + desc.res.pitch2D.format = getArrayFormat(resDesc.res.pitch2D.desc); + desc.res.pitch2D.width = resDesc.res.pitch2D.width; + desc.res.pitch2D.height = resDesc.res.pitch2D.height; + desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes; + break; + default: + break; + } + + return desc; +} + +inline +hipTextureAddressMode getAddressMode(const HIPaddress_mode mode) { + // These two enums should be isomorphic. 
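A host-only sketch of the runtime/driver resource-descriptor conversion shown nearby (hip::getResourceDesc turning a hipResourceDesc into a HIP_RESOURCE_DESC). It assumes the internal header hip_conversions.hpp is reachable on the include path; the pitch-2D sizes and null devPtr are placeholders.

    #include "hip_conversions.hpp"  // assumed reachable; internal header
    #include <cassert>

    int main() {
      hipResourceDesc res = {};
      res.resType = hipResourceTypePitch2D;
      res.res.pitch2D.devPtr = nullptr;  // a real app passes pitched device memory here
      res.res.pitch2D.desc = hipCreateChannelDesc(8, 8, 8, 8, hipChannelFormatKindUnsigned);
      res.res.pitch2D.width = 64;
      res.res.pitch2D.height = 64;
      res.res.pitch2D.pitchInBytes = 64 * 4;

      const HIP_RESOURCE_DESC drv = hip::getResourceDesc(res);
      assert(drv.resType == HIP_RESOURCE_TYPE_PITCH2D);
      assert(drv.res.pitch2D.numChannels == 4);
      assert(drv.res.pitch2D.format == HIP_AD_FORMAT_UNSIGNED_INT8);
      return 0;
    }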
+ return static_cast(mode); +} + +inline +HIPaddress_mode getAddressMode(const hipTextureAddressMode mode) { + // These two enums should be isomorphic. + return static_cast(mode); +} + +inline +hipTextureFilterMode getFilterMode(const HIPfilter_mode mode) { + // These two enums should be isomorphic. + return static_cast(mode); +} + +inline +HIPfilter_mode getFilterMode(const hipTextureFilterMode mode) { + // These two enums should be isomorphic. + return static_cast(mode); +} + +inline +hipTextureReadMode getReadMode(const unsigned int flags) { + if (flags & HIP_TRSF_READ_AS_INTEGER) { + return hipReadModeElementType; + } else { + return hipReadModeNormalizedFloat; + } +} + +inline +unsigned int getReadMode(const hipTextureReadMode mode) { + if (mode == hipReadModeElementType) { + return HIP_TRSF_READ_AS_INTEGER; + } else { + return 0; + } +} + +inline +int getsRGB(const unsigned int flags) { + if (flags & HIP_TRSF_SRGB) { + return 1; + } else { + return 0; + } +} + +inline +unsigned int getsRGB(const int sRGB) { + if (sRGB == 1) { + return HIP_TRSF_SRGB; + } else { + return 0; + } +} + +inline +int getNormalizedCoords(const unsigned int flags) { + if (flags & HIP_TRSF_NORMALIZED_COORDINATES) { + return 1; + } else { + return 0; + } +} + +inline +unsigned int getNormalizedCoords(const int normalizedCoords) { + if (normalizedCoords == 1) { + return HIP_TRSF_NORMALIZED_COORDINATES; + } else { + return 0; + } +} + +inline +hipTextureDesc getTextureDesc(const HIP_TEXTURE_DESC& texDesc) { + hipTextureDesc desc; + + desc.addressMode[0] = getAddressMode(texDesc.addressMode[0]); + desc.addressMode[1] = getAddressMode(texDesc.addressMode[1]); + desc.addressMode[2] = getAddressMode(texDesc.addressMode[2]); + desc.filterMode = getFilterMode(texDesc.filterMode); + desc.readMode = getReadMode(texDesc.flags); + desc.sRGB = getsRGB(texDesc.flags); + std::memcpy(desc.borderColor, texDesc.borderColor, sizeof(desc.borderColor)); + desc.normalizedCoords = getNormalizedCoords(texDesc.flags); + desc.maxAnisotropy = texDesc.maxAnisotropy; + desc.mipmapFilterMode = getFilterMode(texDesc.mipmapFilterMode); + desc.mipmapLevelBias = texDesc.mipmapLevelBias; + desc.minMipmapLevelClamp = texDesc.minMipmapLevelClamp; + desc.maxMipmapLevelClamp = texDesc.maxMipmapLevelClamp; + + return desc; +} + +inline +HIP_TEXTURE_DESC getTextureDesc(const hipTextureDesc& texDesc) { + HIP_TEXTURE_DESC desc; + + desc.addressMode[0] = getAddressMode(texDesc.addressMode[0]); + desc.addressMode[1] = getAddressMode(texDesc.addressMode[1]); + desc.addressMode[2] = getAddressMode(texDesc.addressMode[2]); + desc.filterMode = getFilterMode(texDesc.filterMode); + desc.flags = 0; + desc.flags |= getReadMode(texDesc.readMode); + desc.flags |= getsRGB(texDesc.sRGB); + desc.flags |= getNormalizedCoords(texDesc.normalizedCoords); + desc.maxAnisotropy = texDesc.maxAnisotropy; + desc.mipmapFilterMode = getFilterMode(texDesc.mipmapFilterMode); + desc.mipmapLevelBias = texDesc.mipmapLevelBias; + desc.minMipmapLevelClamp = texDesc.minMipmapLevelClamp; + desc.maxMipmapLevelClamp = texDesc.maxMipmapLevelClamp; + std::memcpy(desc.borderColor, texDesc.borderColor, sizeof(desc.borderColor)); + + return desc; +} + +inline +hipResourceViewFormat getResourceViewFormat(const HIPresourceViewFormat format) { + // These two enums should be isomorphic. + return static_cast(format); +} + +inline +HIPresourceViewFormat getResourceViewFormat(const hipResourceViewFormat format) { + // These two enums should be isomorphic. 
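A host-only sketch of the texture-flag round trip implemented just above: getReadMode()/getsRGB()/getNormalizedCoords() OR their HIP_TRSF_* bits into HIP_TEXTURE_DESC::flags and decode them losslessly on the way back. Again this assumes hip_conversions.hpp is reachable on the include path.

    #include "hip_conversions.hpp"  // assumed reachable; internal header
    #include <cassert>

    int main() {
      hipTextureDesc tex = {};
      tex.readMode = hipReadModeElementType;
      tex.sRGB = 1;
      tex.normalizedCoords = 1;

      const HIP_TEXTURE_DESC drv = hip::getTextureDesc(tex);
      assert(drv.flags == (HIP_TRSF_READ_AS_INTEGER | HIP_TRSF_SRGB |
                           HIP_TRSF_NORMALIZED_COORDINATES));
      assert(hip::getReadMode(drv.flags) == hipReadModeElementType);
      return 0;
    }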
+ return static_cast(format); +} + +inline +hipResourceViewDesc getResourceViewDesc(const HIP_RESOURCE_VIEW_DESC& resViewDesc) { + hipResourceViewDesc desc; + + desc.format = getResourceViewFormat(resViewDesc.format); + desc.width = resViewDesc.width; + desc.height = resViewDesc.height; + desc.depth = resViewDesc.depth; + desc.firstMipmapLevel = resViewDesc.firstMipmapLevel; + desc.lastMipmapLevel = resViewDesc.lastMipmapLevel; + desc.firstLayer = resViewDesc.firstLayer; + desc.lastLayer = resViewDesc.lastLayer; + + return desc; +} + +inline +HIP_RESOURCE_VIEW_DESC getResourceViewDesc(const hipResourceViewDesc& resViewDesc) { + HIP_RESOURCE_VIEW_DESC desc; + + desc.format = getResourceViewFormat(resViewDesc.format); + desc.width = resViewDesc.width; + desc.height = resViewDesc.height; + desc.depth = resViewDesc.depth; + desc.firstMipmapLevel = resViewDesc.firstMipmapLevel; + desc.lastMipmapLevel = resViewDesc.lastMipmapLevel; + desc.firstLayer = resViewDesc.firstLayer; + desc.lastLayer = resViewDesc.lastLayer; + + return desc; +} + +inline +size_t getElementSize(const hipChannelFormatDesc &desc) { + return (desc.x / 8) * getNumChannels(desc); +} +}; diff --git a/rocclr/hip_device.cpp b/rocclr/hip_device.cpp new file mode 100644 index 0000000000..3488bc21bd --- /dev/null +++ b/rocclr/hip_device.cpp @@ -0,0 +1,240 @@ +/* Copyright (c) 2018-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include + +#include "hip_internal.hpp" + +namespace hip { + +// ================================================================================================ +amd::HostQueue* Device::NullStream(bool skip_alloc) { + amd::HostQueue* null_queue = null_stream_.asHostQueue(skip_alloc); + if (null_queue == nullptr) { + return nullptr; + } + // Wait for all active streams before executing commands on the default + iHipWaitActiveStreams(null_queue); + return null_queue; +} + +} + +hipError_t hipDeviceGet(hipDevice_t *device, int deviceId) { + HIP_INIT_API(hipDeviceGet, device, deviceId); + + if (deviceId < 0 || + static_cast(deviceId) >= g_devices.size() || + device == nullptr) { + HIP_RETURN(hipErrorInvalidDevice); + } + *device = deviceId; + HIP_RETURN(hipSuccess); +}; + +hipError_t hipDeviceTotalMem (size_t *bytes, hipDevice_t device) { + + HIP_INIT_API(hipDeviceTotalMem, bytes, device); + + if (device < 0 || static_cast(device) >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidDevice); + } + + if (bytes == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + auto* deviceHandle = g_devices[device]->devices()[0]; + const auto& info = deviceHandle->info(); + + *bytes = info.globalMemSize_; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDeviceComputeCapability(int *major, int *minor, hipDevice_t device) { + + HIP_INIT_API(hipDeviceComputeCapability, major, minor, device); + + if (device < 0 || static_cast(device) >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidDevice); + } + + if (major == nullptr || minor == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + auto* deviceHandle = g_devices[device]->devices()[0]; + const auto& isa = deviceHandle->isa(); + *major = isa.versionMajor(); + *minor = isa.versionMinor(); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDeviceGetCount(int* count) { + HIP_INIT_API(hipDeviceGetCount, count); + + HIP_RETURN(ihipDeviceGetCount(count)); +} + +hipError_t ihipDeviceGetCount(int* count) { + if (count == nullptr) { + return hipErrorInvalidValue; + } + + // Get all available devices + *count = g_devices.size(); + + if (*count < 1) { + return hipErrorNoDevice; + } + + return hipSuccess; +} + +hipError_t hipDeviceGetName(char *name, int len, hipDevice_t device) { + + HIP_INIT_API(hipDeviceGetName, (void*)name, len, device); + + if (device < 0 || static_cast(device) >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidDevice); + } + + if (name == nullptr || len <= 0) { + HIP_RETURN(hipErrorInvalidValue); + } + + auto* deviceHandle = g_devices[device]->devices()[0]; + const auto& info = deviceHandle->info(); + const auto nameLen = ::strlen(info.boardName_); + + // Make sure that the size of `dest` is big enough to hold `src` including + // trailing zero byte + if (nameLen > (cl_uint)(len - 1)) { + HIP_RETURN(hipErrorInvalidValue); + } + + ::strncpy(name, info.boardName_, (nameLen + 1)); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) { + HIP_INIT_API(hipGetDeviceProperties, props, device); + + if (props == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + if (unsigned(device) >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidDevice); + } + auto* deviceHandle = g_devices[device]->devices()[0]; + + hipDeviceProp_t deviceProps = {0}; + + const auto& info = deviceHandle->info(); + const auto& isa = deviceHandle->isa(); + ::strncpy(deviceProps.name, info.boardName_, 128); + deviceProps.totalGlobalMem = info.globalMemSize_; + deviceProps.sharedMemPerBlock = info.localMemSizePerCU_; + 
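A minimal sketch of the driver-style device queries defined in hip_device.cpp here; each call reads the fields noted in the comments out of the underlying amd::Device info. Device 0 is assumed present.

    #include <hip/hip_runtime.h>
    #include <cstdio>

    int main() {
      hipDevice_t dev;
      if (hipDeviceGet(&dev, 0) != hipSuccess) return 1;
      char name[64];
      size_t bytes = 0;
      int major = 0, minor = 0;
      hipDeviceGetName(name, (int)sizeof(name), dev);   // copies info.boardName_
      hipDeviceTotalMem(&bytes, dev);                   // reports info.globalMemSize_
      hipDeviceComputeCapability(&major, &minor, dev);  // reports the ISA major/minor version
      printf("%s: %zu bytes, compute capability %d.%d\n", name, bytes, major, minor);
      return 0;
    }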
deviceProps.regsPerBlock = info.availableRegistersPerCU_; + deviceProps.warpSize = info.wavefrontWidth_; + deviceProps.maxThreadsPerBlock = info.maxWorkGroupSize_; + deviceProps.maxThreadsDim[0] = info.maxWorkItemSizes_[0]; + deviceProps.maxThreadsDim[1] = info.maxWorkItemSizes_[1]; + deviceProps.maxThreadsDim[2] = info.maxWorkItemSizes_[2]; + deviceProps.maxGridSize[0] = INT32_MAX; + deviceProps.maxGridSize[1] = INT32_MAX; + deviceProps.maxGridSize[2] = INT32_MAX; + deviceProps.clockRate = info.maxEngineClockFrequency_ * 1000; + deviceProps.memoryClockRate = info.maxMemoryClockFrequency_ * 1000; + deviceProps.memoryBusWidth = info.globalMemChannels_; + deviceProps.totalConstMem = info.maxConstantBufferSize_; + deviceProps.major = isa.versionMajor(); + deviceProps.minor = isa.versionMinor(); + deviceProps.multiProcessorCount = info.maxComputeUnits_; + deviceProps.l2CacheSize = info.l2CacheSize_; + deviceProps.maxThreadsPerMultiProcessor = info.maxThreadsPerCU_; + deviceProps.computeMode = 0; + deviceProps.clockInstructionRate = info.timeStampFrequency_; + deviceProps.arch.hasGlobalInt32Atomics = 1; + deviceProps.arch.hasGlobalFloatAtomicExch = 1; + deviceProps.arch.hasSharedInt32Atomics = 1; + deviceProps.arch.hasSharedFloatAtomicExch = 1; + deviceProps.arch.hasFloatAtomicAdd = 1; + deviceProps.arch.hasGlobalInt64Atomics = 1; + deviceProps.arch.hasSharedInt64Atomics = 1; + deviceProps.arch.hasDoubles = 1; + deviceProps.arch.hasWarpVote = 1; + deviceProps.arch.hasWarpBallot = 1; + deviceProps.arch.hasWarpShuffle = 1; + deviceProps.arch.hasFunnelShift = 0; + deviceProps.arch.hasThreadFenceSystem = 1; + deviceProps.arch.hasSyncThreadsExt = 0; + deviceProps.arch.hasSurfaceFuncs = 0; + deviceProps.arch.has3dGrid = 1; + deviceProps.arch.hasDynamicParallelism = 0; + deviceProps.concurrentKernels = 1; + deviceProps.pciDomainID = info.pciDomainID; + deviceProps.pciBusID = info.deviceTopology_.pcie.bus; + deviceProps.pciDeviceID = info.deviceTopology_.pcie.device; + deviceProps.maxSharedMemoryPerMultiProcessor = info.localMemSizePerCU_; + deviceProps.canMapHostMemory = 1; + //FIXME: This should be removed, targets can have character names as well. 
+ deviceProps.gcnArch = isa.versionMajor() * 100 + isa.versionMinor() * 10 + isa.versionStepping(); + sprintf(deviceProps.gcnArchName, "%s", isa.targetId()); + deviceProps.cooperativeLaunch = info.cooperativeGroups_; + deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_; + + deviceProps.cooperativeMultiDeviceUnmatchedFunc = info.cooperativeMultiDeviceGroups_; + deviceProps.cooperativeMultiDeviceUnmatchedGridDim = info.cooperativeMultiDeviceGroups_; + deviceProps.cooperativeMultiDeviceUnmatchedBlockDim = info.cooperativeMultiDeviceGroups_; + deviceProps.cooperativeMultiDeviceUnmatchedSharedMem = info.cooperativeMultiDeviceGroups_; + + deviceProps.maxTexture1DLinear = 16 * info.imageMaxBufferSize_; // Max pixel size is 16 bytes + deviceProps.maxTexture1D = info.image1DMaxWidth_; + deviceProps.maxTexture2D[0] = info.image2DMaxWidth_; + deviceProps.maxTexture2D[1] = info.image2DMaxHeight_; + deviceProps.maxTexture3D[0] = info.image3DMaxWidth_; + deviceProps.maxTexture3D[1] = info.image3DMaxHeight_; + deviceProps.maxTexture3D[2] = info.image3DMaxDepth_; + deviceProps.hdpMemFlushCntl = info.hdpMemFlushCntl; + deviceProps.hdpRegFlushCntl = info.hdpRegFlushCntl; + + deviceProps.memPitch = info.maxMemAllocSize_; + deviceProps.textureAlignment = info.imageBaseAddressAlignment_; + deviceProps.texturePitchAlignment = info.imagePitchAlignment_; + deviceProps.kernelExecTimeoutEnabled = 0; + deviceProps.ECCEnabled = info.errorCorrectionSupport_? 1:0; + deviceProps.isLargeBar = info.largeBar_ ? 1 : 0; + deviceProps.asicRevision = info.asicRevision_; + + // HMM capabilities + deviceProps.managedMemory = info.hmmSupported_; + deviceProps.concurrentManagedAccess = info.hmmSupported_; + deviceProps.directManagedMemAccessFromHost = info.hmmDirectHostAccess_; + deviceProps.pageableMemoryAccess = info.hmmCpuMemoryAccessible_; + deviceProps.pageableMemoryAccessUsesHostPageTables = info.hostUnifiedMemory_; + + *props = deviceProps; + HIP_RETURN(hipSuccess); +} diff --git a/rocclr/hip_device_runtime.cpp b/rocclr/hip_device_runtime.cpp new file mode 100755 index 0000000000..560821c033 --- /dev/null +++ b/rocclr/hip_device_runtime.cpp @@ -0,0 +1,563 @@ +/* Copyright (c) 2018-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include + +#include "hip_internal.hpp" + +hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* properties) { + + HIP_INIT_API(hipChooseDevice, device, properties); + + if (device == nullptr || properties == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + *device = 0; + cl_uint maxMatchedCount = 0; + int count = 0; + ihipDeviceGetCount(&count); + + for (cl_int i = 0; i< count; ++i) { + hipDeviceProp_t currentProp = {0}; + cl_uint validPropCount = 0; + cl_uint matchedCount = 0; + hipError_t err = hipGetDeviceProperties(¤tProp, i); + if (properties->major != 0) { + validPropCount++; + if(currentProp.major >= properties->major) { + matchedCount++; + } + } + if (properties->minor != 0) { + validPropCount++; + if(currentProp.minor >= properties->minor) { + matchedCount++; + } + } + if(properties->totalGlobalMem != 0) { + validPropCount++; + if(currentProp.totalGlobalMem >= properties->totalGlobalMem) { + matchedCount++; + } + } + if(properties->sharedMemPerBlock != 0) { + validPropCount++; + if(currentProp.sharedMemPerBlock >= properties->sharedMemPerBlock) { + matchedCount++; + } + } + if(properties->maxThreadsPerBlock != 0) { + validPropCount++; + if(currentProp.maxThreadsPerBlock >= properties->maxThreadsPerBlock ) { + matchedCount++; + } + } + if(properties->totalConstMem != 0) { + validPropCount++; + if(currentProp.totalConstMem >= properties->totalConstMem ) { + matchedCount++; + } + } + if(properties->multiProcessorCount != 0) { + validPropCount++; + if(currentProp.multiProcessorCount >= + properties->multiProcessorCount ) { + matchedCount++; + } + } + if(properties->maxThreadsPerMultiProcessor != 0) { + validPropCount++; + if(currentProp.maxThreadsPerMultiProcessor >= + properties->maxThreadsPerMultiProcessor ) { + matchedCount++; + } + } + if(properties->memoryClockRate != 0) { + validPropCount++; + if(currentProp.memoryClockRate >= properties->memoryClockRate ) { + matchedCount++; + } + } + if(properties->memoryBusWidth != 0) { + validPropCount++; + if(currentProp.memoryBusWidth >= properties->memoryBusWidth ) { + matchedCount++; + } + } + if(properties->l2CacheSize != 0) { + validPropCount++; + if(currentProp.l2CacheSize >= properties->l2CacheSize ) { + matchedCount++; + } + } + if(properties->regsPerBlock != 0) { + validPropCount++; + if(currentProp.regsPerBlock >= properties->regsPerBlock ) { + matchedCount++; + } + } + if(properties->maxSharedMemoryPerMultiProcessor != 0) { + validPropCount++; + if(currentProp.maxSharedMemoryPerMultiProcessor >= + properties->maxSharedMemoryPerMultiProcessor ) { + matchedCount++; + } + } + if(properties->warpSize != 0) { + validPropCount++; + if(currentProp.warpSize >= properties->warpSize ) { + matchedCount++; + } + } + if(validPropCount == matchedCount) { + *device = matchedCount > maxMatchedCount ? i : *device; + maxMatchedCount = std::max(matchedCount, maxMatchedCount); + } + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) { + + HIP_INIT_API(hipDeviceGetAttribute, pi, attr, device); + + if (pi == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + int count = 0; + ihipDeviceGetCount(&count); + if (device < 0 || device >= count) { + HIP_RETURN(hipErrorInvalidDevice); + } + + //FIXME: should we cache the props, or just select from deviceHandle->info_? 
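A short sketch of hipDeviceGetAttribute(), whose switch below answers each attribute out of a freshly fetched hipDeviceProp_t. Device 0 is assumed present.

    #include <hip/hip_runtime.h>
    #include <cstdio>

    int main() {
      int cus = 0, warp = 0;
      hipDeviceGetAttribute(&cus, hipDeviceAttributeMultiprocessorCount, 0);
      hipDeviceGetAttribute(&warp, hipDeviceAttributeWarpSize, 0);
      printf("device 0: %d CUs, wavefront size %d\n", cus, warp);
      return 0;
    }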
+ hipDeviceProp_t prop = {0}; + hipError_t err = hipGetDeviceProperties(&prop, device); + if (err != hipSuccess) { + HIP_RETURN(err); + } + + switch (attr) { + case hipDeviceAttributeMaxThreadsPerBlock: + *pi = prop.maxThreadsPerBlock; + break; + case hipDeviceAttributeMaxBlockDimX: + *pi = prop.maxThreadsDim[0]; + break; + case hipDeviceAttributeMaxBlockDimY: + *pi = prop.maxThreadsDim[1]; + break; + case hipDeviceAttributeMaxBlockDimZ: + *pi = prop.maxThreadsDim[2]; + break; + case hipDeviceAttributeMaxGridDimX: + *pi = prop.maxGridSize[0]; + break; + case hipDeviceAttributeMaxGridDimY: + *pi = prop.maxGridSize[1]; + break; + case hipDeviceAttributeMaxGridDimZ: + *pi = prop.maxGridSize[2]; + break; + case hipDeviceAttributeMaxSharedMemoryPerBlock: + *pi = prop.sharedMemPerBlock; + break; + case hipDeviceAttributeTotalConstantMemory: + *pi = prop.totalConstMem; + break; + case hipDeviceAttributeWarpSize: + *pi = prop.warpSize; + break; + case hipDeviceAttributeMaxRegistersPerBlock: + *pi = prop.regsPerBlock; + break; + case hipDeviceAttributeClockRate: + *pi = prop.clockRate; + break; + case hipDeviceAttributeMemoryClockRate: + *pi = prop.memoryClockRate; + break; + case hipDeviceAttributeMemoryBusWidth: + *pi = prop.memoryBusWidth; + break; + case hipDeviceAttributeMultiprocessorCount: + *pi = prop.multiProcessorCount; + break; + case hipDeviceAttributeComputeMode: + *pi = prop.computeMode; + break; + case hipDeviceAttributeL2CacheSize: + *pi = prop.l2CacheSize; + break; + case hipDeviceAttributeMaxThreadsPerMultiProcessor: + *pi = prop.maxThreadsPerMultiProcessor; + break; + case hipDeviceAttributeComputeCapabilityMajor: + *pi = prop.major; + break; + case hipDeviceAttributeComputeCapabilityMinor: + *pi = prop.minor; + break; + case hipDeviceAttributePciBusId: + *pi = prop.pciBusID; + break; + case hipDeviceAttributeConcurrentKernels: + *pi = prop.concurrentKernels; + break; + case hipDeviceAttributePciDeviceId: + *pi = prop.pciDeviceID; + break; + case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: + *pi = prop.maxSharedMemoryPerMultiProcessor; + break; + case hipDeviceAttributeIsMultiGpuBoard: + *pi = prop.isMultiGpuBoard; + break; + case hipDeviceAttributeCooperativeLaunch: + *pi = prop.cooperativeLaunch; + break; + case hipDeviceAttributeCooperativeMultiDeviceLaunch: + *pi = prop.cooperativeMultiDeviceLaunch; + break; + case hipDeviceAttributeIntegrated: + *pi = prop.integrated; + break; + case hipDeviceAttributeMaxTexture1DWidth: + *pi = prop.maxTexture1D; + break; + case hipDeviceAttributeMaxTexture2DWidth: + *pi = prop.maxTexture2D[0]; + break; + case hipDeviceAttributeMaxTexture2DHeight: + *pi = prop.maxTexture2D[1]; + break; + case hipDeviceAttributeMaxTexture3DWidth: + *pi = prop.maxTexture3D[0]; + break; + case hipDeviceAttributeMaxTexture3DHeight: + *pi = prop.maxTexture3D[1]; + break; + case hipDeviceAttributeMaxTexture3DDepth: + *pi = prop.maxTexture3D[2]; + break; + case hipDeviceAttributeHdpMemFlushCntl: + *reinterpret_cast(pi) = prop.hdpMemFlushCntl; + break; + case hipDeviceAttributeHdpRegFlushCntl: + *reinterpret_cast(pi) = prop.hdpRegFlushCntl; + break; + case hipDeviceAttributeMaxPitch: + *pi = prop.memPitch; + break; + case hipDeviceAttributeTextureAlignment: + *pi = prop.textureAlignment; + break; + case hipDeviceAttributeTexturePitchAlignment: + *pi = prop.texturePitchAlignment; + break; + case hipDeviceAttributeKernelExecTimeout: + *pi = prop.kernelExecTimeoutEnabled; + break; + case hipDeviceAttributeCanMapHostMemory: + *pi = prop.canMapHostMemory; + 
break; + case hipDeviceAttributeEccEnabled: + *pi = prop.ECCEnabled; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc: + *pi = prop.cooperativeMultiDeviceUnmatchedFunc; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim: + *pi = prop.cooperativeMultiDeviceUnmatchedGridDim; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim: + *pi = prop.cooperativeMultiDeviceUnmatchedBlockDim; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem: + *pi = prop.cooperativeMultiDeviceUnmatchedSharedMem; + break; + case hipDeviceAttributeAsicRevision: + *pi = prop.asicRevision; + break; + case hipDeviceAttributeManagedMemory: + *pi = prop.managedMemory; + break; + case hipDeviceAttributeDirectManagedMemAccessFromHost: + *pi = prop.directManagedMemAccessFromHost; + break; + case hipDeviceAttributeConcurrentManagedAccess: + *pi = prop.concurrentManagedAccess; + break; + case hipDeviceAttributePageableMemoryAccess: + *pi = prop.pageableMemoryAccess; + break; + case hipDeviceAttributePageableMemoryAccessUsesHostPageTables: + *pi = prop.pageableMemoryAccessUsesHostPageTables; + break; + case hipDeviceAttributeCanUseStreamWaitValue: + // hipStreamWaitValue64() and hipStreamWaitValue32() support + *pi = g_devices[device]->devices()[0]->info().aqlBarrierValue_; + break; + default: + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDeviceGetByPCIBusId(int* device, const char*pciBusIdstr) { + + HIP_INIT_API(hipDeviceGetByPCIBusId, device, pciBusIdstr); + + if (device == nullptr || pciBusIdstr == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + int pciBusID = -1; + int pciDeviceID = -1; + int pciDomainID = -1; + bool found = false; + if (sscanf (pciBusIdstr, "%04x:%02x:%02x", &pciDomainID, &pciBusID, &pciDeviceID) == 0x3) { + int count = 0; + ihipDeviceGetCount(&count); + for (cl_int i = 0; i < count; i++) { + hipDevice_t dev; + hipDeviceGet(&dev, i); + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, dev); + + if ((pciBusID == prop.pciBusID) && (pciDomainID == prop.pciDomainID) + && (pciDeviceID == prop.pciDeviceID)) { + *device = i; + found = true; + break; + } + } + } + if (!found) { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDeviceGetCacheConfig ( hipFuncCache_t * cacheConfig ) { + HIP_INIT_API(hipDeviceGetCacheConfig, cacheConfig); + + if(cacheConfig == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + *cacheConfig = hipFuncCache_t(); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDeviceGetLimit ( size_t* pValue, hipLimit_t limit ) { + + HIP_INIT_API(hipDeviceGetLimit, pValue, limit); + + if(pValue == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + if(limit == hipLimitMallocHeapSize) { + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, ihipGetDevice()); + + *pValue = prop.totalGlobalMem; + HIP_RETURN(hipSuccess); + } else { + HIP_RETURN(hipErrorUnsupportedLimit); + } +} + +hipError_t hipDeviceGetPCIBusId ( char* pciBusId, int len, int device ) { + + HIP_INIT_API(hipDeviceGetPCIBusId, (void*)pciBusId, len, device); + + int count; + ihipDeviceGetCount(&count); + if (device < 0 || device >= count) { + HIP_RETURN(hipErrorInvalidDevice); + } + + if (pciBusId == nullptr || len <= 0) { + HIP_RETURN(hipErrorInvalidValue); + } + + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, device); + + snprintf (pciBusId, len, "%04x:%02x:%02x.0", + prop.pciDomainID, + prop.pciBusID, + prop.pciDeviceID); + + HIP_RETURN(hipSuccess); +} + 
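A sketch of the PCI bus-ID round trip: hipDeviceGetPCIBusId() above formats "domain:bus:device.0", which is exactly the layout the sscanf() in hipDeviceGetByPCIBusId() parses back to a device ordinal. Device 0 is assumed present.

    #include <hip/hip_runtime.h>
    #include <cstdio>

    int main() {
      char busId[16];
      if (hipDeviceGetPCIBusId(busId, (int)sizeof(busId), 0) != hipSuccess) return 1;
      int dev = -1;
      hipDeviceGetByPCIBusId(&dev, busId);  // resolves the string back to ordinal 0
      printf("%s -> device %d\n", busId, dev);
      return 0;
    }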
+hipError_t hipDeviceGetSharedMemConfig ( hipSharedMemConfig * pConfig ) { + HIP_INIT_API(hipDeviceGetSharedMemConfig, pConfig); + if (pConfig == nullptr) { + return HIP_RETURN(hipErrorInvalidValue); + } + *pConfig = hipSharedMemBankSizeFourByte; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDeviceReset ( void ) { + HIP_INIT_API(hipDeviceReset); + + /* FIXME */ + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDeviceSetCacheConfig ( hipFuncCache_t cacheConfig ) { + HIP_INIT_API(hipDeviceSetCacheConfig, cacheConfig); + + // No way to set cache config yet. + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDeviceSetLimit ( hipLimit_t limit, size_t value ) { + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config ) { + HIP_INIT_API(hipDeviceSetSharedMemConfig, config); + + // No way to set cache config yet. + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDeviceSynchronize ( void ) { + HIP_INIT_API(hipDeviceSynchronize); + + amd::HostQueue* queue = hip::getNullStream(); + + if (!queue) { + HIP_RETURN(hipErrorOutOfMemory); + } + + queue->finish(); + + hip::Stream::syncNonBlockingStreams(); + + HIP_RETURN(hipSuccess); +} + +int ihipGetDevice() { + hip::Device* device = hip::getCurrentDevice(); + if(device == nullptr){ + return -1; + } + return device->deviceId(); +} + +hipError_t hipGetDevice ( int* deviceId ) { + HIP_INIT_API(hipGetDevice, deviceId); + + if (deviceId != nullptr) { + int dev = ihipGetDevice(); + if (dev == -1) { + HIP_RETURN(hipErrorNoDevice); + } + *deviceId = dev; + HIP_RETURN(hipSuccess); + } else { + HIP_RETURN(hipErrorInvalidValue); + } +} + +hipError_t hipGetDeviceCount ( int* count ) { + HIP_INIT_API(hipGetDeviceCount, count); + + HIP_RETURN(ihipDeviceGetCount(count)); +} + +hipError_t hipGetDeviceFlags ( unsigned int* flags ) { + HIP_INIT_API(hipGetDeviceFlags, flags); + if (flags == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + *flags = hip::getCurrentDevice()->getFlags(); + HIP_RETURN(hipSuccess); +} + +hipError_t hipSetDevice ( int device ) { + HIP_INIT_API(hipSetDevice, device); + + if (static_cast(device) < g_devices.size()) { + hip::setCurrentDevice(device); + + HIP_RETURN(hipSuccess); + } + HIP_RETURN(hipErrorInvalidDevice); +} + +hipError_t hipSetDeviceFlags ( unsigned int flags ) { + HIP_INIT_API(hipSetDeviceFlags, flags); + + constexpr uint32_t supportedFlags = + hipDeviceScheduleMask | hipDeviceMapHost | hipDeviceLmemResizeToMax; + + if (flags & ~supportedFlags) { + HIP_RETURN(hipErrorInvalidValue); + } + + amd::Device* device = hip::getCurrentDevice()->devices()[0]; + switch (flags & hipDeviceScheduleMask) { + case hipDeviceScheduleAuto: + // Current behavior is different from the spec, due to MT usage in runtime + if (hip::host_device->devices().size() >= std::thread::hardware_concurrency()) { + device->SetActiveWait(false); + break; + } + // Fall through for active wait... 
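A hedged sketch of the scheduling flags handled by the hipSetDeviceFlags() switch around this point: hipDeviceScheduleBlockingSync disables active wait on the underlying amd::Device, while spin/yield keep it enabled.

    #include <hip/hip_runtime.h>

    int main() {
      if (hipSetDeviceFlags(hipDeviceScheduleBlockingSync) != hipSuccess) return 1;
      unsigned int flags = 0;
      hipGetDeviceFlags(&flags);  // returns the schedule bits recorded by setFlags()
      return 0;
    }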
+ case hipDeviceScheduleSpin: + case hipDeviceScheduleYield: + // Both options fall back to yield, because of MT usage in the runtime + device->SetActiveWait(true); + break; + case hipDeviceScheduleBlockingSync: + device->SetActiveWait(false); + break; + default: + break; + } + hip::getCurrentDevice()->setFlags(flags & hipDeviceScheduleMask); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipSetValidDevices ( int* device_arr, int len ) { + HIP_INIT_API(hipSetValidDevices, device_arr, len); + + assert(0 && "Unimplemented"); + + HIP_RETURN(hipErrorNotSupported); +} diff --git a/rocclr/hip_error.cpp b/rocclr/hip_error.cpp new file mode 100644 index 0000000000..7ecedff8d4 --- /dev/null +++ b/rocclr/hip_error.cpp @@ -0,0 +1,176 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE.
*/ + +#include <hip/hip_runtime.h> + +#include "hip_internal.hpp" + +hipError_t hipGetLastError() +{ + HIP_INIT_API(hipGetLastError); + hipError_t err = hip::g_lastError; + hip::g_lastError = hipSuccess; + return err; +} + +hipError_t hipPeekAtLastError() +{ + HIP_INIT_API(hipPeekAtLastError); + hipError_t err = hip::g_lastError; + HIP_RETURN(err); +} + +const char *hipGetErrorName(hipError_t hip_error) +{ + switch (hip_error) { + case hipSuccess: + return "hipSuccess"; + case hipErrorInvalidValue: + return "hipErrorInvalidValue"; + case hipErrorOutOfMemory: + return "hipErrorOutOfMemory"; + case hipErrorNotInitialized: + return "hipErrorNotInitialized"; + case hipErrorDeinitialized: + return "hipErrorDeinitialized"; + case hipErrorProfilerDisabled: + return "hipErrorProfilerDisabled"; + case hipErrorProfilerNotInitialized: + return "hipErrorProfilerNotInitialized"; + case hipErrorProfilerAlreadyStarted: + return "hipErrorProfilerAlreadyStarted"; + case hipErrorProfilerAlreadyStopped: + return "hipErrorProfilerAlreadyStopped"; + case hipErrorInvalidConfiguration: + return "hipErrorInvalidConfiguration"; + case hipErrorInvalidSymbol: + return "hipErrorInvalidSymbol"; + case hipErrorInvalidDevicePointer: + return "hipErrorInvalidDevicePointer"; + case hipErrorInvalidMemcpyDirection: + return "hipErrorInvalidMemcpyDirection"; + case hipErrorInsufficientDriver: + return "hipErrorInsufficientDriver"; + case hipErrorMissingConfiguration: + return "hipErrorMissingConfiguration"; + case hipErrorPriorLaunchFailure: + return "hipErrorPriorLaunchFailure"; + case hipErrorInvalidDeviceFunction: + return "hipErrorInvalidDeviceFunction"; + case hipErrorNoDevice: + return "hipErrorNoDevice"; + case hipErrorInvalidDevice: + return "hipErrorInvalidDevice"; + case hipErrorInvalidPitchValue: + return "hipErrorInvalidPitchValue"; + case hipErrorInvalidImage: + return "hipErrorInvalidImage"; + case hipErrorInvalidContext: + return "hipErrorInvalidContext"; + case hipErrorContextAlreadyCurrent: + return "hipErrorContextAlreadyCurrent"; + case hipErrorMapFailed: + return "hipErrorMapFailed"; + case hipErrorUnmapFailed: + return "hipErrorUnmapFailed"; + case hipErrorArrayIsMapped: + return "hipErrorArrayIsMapped"; + case hipErrorAlreadyMapped: + return "hipErrorAlreadyMapped"; + case hipErrorNoBinaryForGpu: + return "hipErrorNoBinaryForGpu"; + case hipErrorAlreadyAcquired: + return "hipErrorAlreadyAcquired"; + case hipErrorNotMapped: + return "hipErrorNotMapped"; + case hipErrorNotMappedAsArray: + return "hipErrorNotMappedAsArray"; + case hipErrorNotMappedAsPointer: + return "hipErrorNotMappedAsPointer"; + case hipErrorECCNotCorrectable: + return "hipErrorECCNotCorrectable"; + case hipErrorUnsupportedLimit: + return "hipErrorUnsupportedLimit"; + case hipErrorContextAlreadyInUse: + return "hipErrorContextAlreadyInUse"; + case hipErrorPeerAccessUnsupported: + return "hipErrorPeerAccessUnsupported"; + case hipErrorInvalidKernelFile: + return "hipErrorInvalidKernelFile"; + case hipErrorInvalidGraphicsContext: + return "hipErrorInvalidGraphicsContext"; + case hipErrorInvalidSource: + return "hipErrorInvalidSource"; + case hipErrorFileNotFound: + return "hipErrorFileNotFound"; + case hipErrorSharedObjectSymbolNotFound: + return "hipErrorSharedObjectSymbolNotFound"; + case hipErrorSharedObjectInitFailed: + return "hipErrorSharedObjectInitFailed"; + case hipErrorOperatingSystem: + return "hipErrorOperatingSystem"; + case hipErrorInvalidHandle: + return "hipErrorInvalidHandle"; + case hipErrorNotFound: + return "hipErrorNotFound"; + case 
hipErrorNotReady: + return "hipErrorNotReady"; + case hipErrorIllegalAddress: + return "hipErrorIllegalAddress"; + case hipErrorLaunchOutOfResources: + return "hipErrorLaunchOutOfResources"; + case hipErrorLaunchTimeOut: + return "hipErrorLaunchTimeOut"; + case hipErrorPeerAccessAlreadyEnabled: + return "hipErrorPeerAccessAlreadyEnabled"; + case hipErrorPeerAccessNotEnabled: + return "hipErrorPeerAccessNotEnabled"; + case hipErrorSetOnActiveProcess: + return "hipErrorSetOnActiveProcess"; + case hipErrorAssert: + return "hipErrorAssert"; + case hipErrorHostMemoryAlreadyRegistered: + return "hipErrorHostMemoryAlreadyRegistered"; + case hipErrorHostMemoryNotRegistered: + return "hipErrorHostMemoryNotRegistered"; + case hipErrorLaunchFailure: + return "hipErrorLaunchFailure"; + case hipErrorNotSupported: + return "hipErrorNotSupported"; + case hipErrorUnknown: + return "hipErrorUnknown"; + case hipErrorRuntimeMemory: + return "hipErrorRuntimeMemory"; + case hipErrorRuntimeOther: + return "hipErrorRuntimeOther"; + case hipErrorCooperativeLaunchTooLarge: + return "hipErrorCooperativeLaunchTooLarge"; + case hipErrorTbd: + return "hipErrorTbd"; + default: + return "hipErrorUnknown"; + }; +} + +const char *hipGetErrorString(hipError_t hip_error) +{ + return hipGetErrorName(hip_error); +} + diff --git a/rocclr/hip_event.cpp b/rocclr/hip_event.cpp new file mode 100755 index 0000000000..05143bbda2 --- /dev/null +++ b/rocclr/hip_event.cpp @@ -0,0 +1,452 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include <hip/hip_runtime.h> + +#include "hip_event.hpp" + +void ipcEventCallback(hipStream_t stream, hipError_t status, void* user_data) +{ + std::atomic<int>* signal = reinterpret_cast<std::atomic<int>*>(user_data); + signal->store(0); + return; +} + +namespace hip { + +bool Event::ready() { + if (event_->status() != CL_COMPLETE) { + event_->notifyCmdQueue(); + } + + return (event_->status() == CL_COMPLETE); +} + +hipError_t Event::query() { + amd::ScopedLock lock(lock_); + + // If event is not recorded, event_ is null, hence return hipSuccess + if (event_ == nullptr) { + return hipSuccess; + } + + return ready() ? 
hipSuccess : hipErrorNotReady; +} + +hipError_t Event::synchronize() { + amd::ScopedLock lock(lock_); + + // If event is not recorded, event_ is null, hence return hipSuccess + if (event_ == nullptr) { + return hipSuccess; + } + + event_->awaitCompletion(); + + return hipSuccess; +} + +hipError_t Event::elapsedTime(Event& eStop, float& ms) { + amd::ScopedLock startLock(lock_); + + if (this == &eStop) { + if (event_ == nullptr) { + return hipErrorInvalidHandle; + } + + if (flags & hipEventDisableTiming) { + return hipErrorInvalidHandle; + } + + if (!ready()) { + return hipErrorNotReady; + } + + ms = 0.f; + return hipSuccess; + } + amd::ScopedLock stopLock(eStop.lock_); + + if (event_ == nullptr || + eStop.event_ == nullptr) { + return hipErrorInvalidHandle; + } + + if ((flags | eStop.flags) & hipEventDisableTiming) { + return hipErrorInvalidHandle; + } + + if (!ready() || !eStop.ready()) { + return hipErrorNotReady; + } + + if (event_ == eStop.event_ && recorded_ && eStop.recorded_) { + // Events are the same, which indicates the stream is empty and likely + // eventRecord is called on another stream. For such cases insert and measure a + // marker. + amd::Command* command = new amd::Marker(*event_->command().queue(), kMarkerDisableFlush); + command->enqueue(); + command->awaitCompletion(); + ms = static_cast<float>(static_cast<int64_t>(command->event().profilingInfo().end_) - time())/1000000.f; + command->release(); + } else { + ms = static_cast<float>(eStop.time() - time())/1000000.f; + } + return hipSuccess; +} + +int64_t Event::time() const { + assert(event_ != nullptr); + if (recorded_) { + return static_cast<int64_t>(event_->profilingInfo().end_); + } else { + return static_cast<int64_t>(event_->profilingInfo().start_); + } +} + +hipError_t Event::streamWait(amd::HostQueue* hostQueue, uint flags) { + if ((event_ == nullptr) || (event_->command().queue() == hostQueue)) { + return hipSuccess; + } + + amd::ScopedLock lock(lock_); + bool retain = false; + + if (!event_->notifyCmdQueue()) { + return hipErrorLaunchOutOfResources; + } + amd::Command::EventWaitList eventWaitList; + eventWaitList.push_back(event_); + + amd::Command* command = new amd::Marker(*hostQueue, kMarkerDisableFlush, eventWaitList); + if (command == NULL) { + return hipErrorOutOfMemory; + } + command->enqueue(); + command->release(); + + return hipSuccess; +} + +void Event::addMarker(amd::HostQueue* queue, amd::Command* command, bool record) { + // Keep the lock always at the beginning of this to avoid a race. SWDEV-277847 + amd::ScopedLock lock(lock_); + + if (command == nullptr) { + command = queue->getLastQueuedCommand(true); + + bool cmdNullOrMarker = (command == nullptr) || (command->type() == 0); + + // If lastQueuedCommand is user invisible command(command->type() == 0), + // Always submit a marker if queue profiling is not explicitly enabled else + // submit a normal marker. 
Disable queue flush to batch commands + if (!queue->properties().test(CL_QUEUE_PROFILING_ENABLE) && + !(flags & hipEventDisableTiming)) { + if (command != nullptr) { + command->release(); + } + command = new hip::ProfileMarker(*queue, cmdNullOrMarker, true); + command->enqueue(); + } else if (cmdNullOrMarker) { + if (command != nullptr) { + command->release(); + } + command = new amd::Marker(*queue, kMarkerDisableFlush); + command->enqueue(); + } + } + + if (event_ == &command->event()) return; + + if (event_ != nullptr) { + event_->release(); + } + + event_ = &command->event(); + // Notify queue earlier so SW status for the command can be updated faster, + // since marker potentially means a wait + if (AMD_DIRECT_DISPATCH && (flags & hipEventDisableTiming)) { + command->notifyCmdQueue(); + } + recorded_ = record; +} + +} + +hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) { + if (event == nullptr) { + return hipErrorInvalidValue; + } +#if !defined(_MSC_VER) + unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming | + hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventInterprocess; +#else + unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming | + hipEventReleaseToDevice | hipEventReleaseToSystem; +#endif + const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem); + + const bool illegalFlags = + (flags & ~supportedFlags) || // can't set any unsupported flags. + (flags & releaseFlags) == releaseFlags; // can't set both release flags + + if (!illegalFlags) { + hip::Event* e = new hip::Event(flags); + if (e == nullptr) { + return hipErrorOutOfMemory; + } + *event = reinterpret_cast<hipEvent_t>(e); + } else { + return hipErrorInvalidValue; + } + return hipSuccess; +} + +hipError_t ihipEventQuery(hipEvent_t event) { + if (event == nullptr) { + return hipErrorInvalidHandle; + } + + hip::Event* e = reinterpret_cast<hip::Event*>(event); + if ((e->flags & hipEventInterprocess) && (e->ipc_evt_.ipc_shmem_)) { + int prev_read_idx = e->ipc_evt_.ipc_shmem_->read_index; + int offset = (prev_read_idx % IPC_SIGNALS_PER_EVENT); + if (e->ipc_evt_.ipc_shmem_->read_index < prev_read_idx+IPC_SIGNALS_PER_EVENT && e->ipc_evt_.ipc_shmem_->signal[offset] != 0) { + return hipErrorNotReady; + } + return hipSuccess; + } else { + return e->query(); + } +} + +hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags) { + HIP_INIT_API(hipEventCreateWithFlags, event, flags); + HIP_RETURN(ihipEventCreateWithFlags(event, flags), *event); +} + +hipError_t hipEventCreate(hipEvent_t* event) { + HIP_INIT_API(hipEventCreate, event); + HIP_RETURN(ihipEventCreateWithFlags(event, 0), *event); +} + +hipError_t hipEventDestroy(hipEvent_t event) { + HIP_INIT_API(hipEventDestroy, event); + + if (event == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } + + hip::Event* e = reinterpret_cast<hip::Event*>(event); + if ((e->flags & hipEventInterprocess) && (e->ipc_evt_.ipc_shmem_)) { + int owners = --e->ipc_evt_.ipc_shmem_->owners; + // Make sure event is synchronized + hipEventSynchronize(event); + if (!amd::Os::MemoryUnmapFile(e->ipc_evt_.ipc_shmem_, sizeof(hip::ihipIpcEventShmem_t))) { + HIP_RETURN(hipErrorInvalidHandle); + } + } + delete e; + HIP_RETURN(hipSuccess); +} + +hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) { + HIP_INIT_API(hipEventElapsedTime, ms, start, stop); + + if (start == nullptr || stop == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } + + if (ms == nullptr) { + 
HIP_RETURN(hipErrorInvalidValue); + } + + hip::Event* eStart = reinterpret_cast<hip::Event*>(start); + hip::Event* eStop = reinterpret_cast<hip::Event*>(stop); + + if (eStart->deviceId() != eStop->deviceId()) { + HIP_RETURN(hipErrorInvalidHandle); + } + + HIP_RETURN(eStart->elapsedTime(*eStop, *ms), "Elapsed Time = ", *ms); +} + +// ================================================================================================ +bool createIpcEventShmemIfNeeded(hip::Event::ihipIpcEvent_t& ipc_evt) { +#if !defined(_MSC_VER) + if (ipc_evt.ipc_shmem_) { + // ipc_shmem_ already created, no need to create it again + return true; + } + char name_template[] = "/tmp/eventXXXXXX"; + int temp_fd = mkstemp(name_template); + + ipc_evt.ipc_name_ = name_template; + ipc_evt.ipc_name_.replace(0, 5, "/hip_"); + if (!amd::Os::MemoryMapFileTruncated(ipc_evt.ipc_name_.c_str(), + const_cast<const void**>(reinterpret_cast<void**>(&(ipc_evt.ipc_shmem_))), sizeof(hip::ihipIpcEventShmem_t))) { + return false; + } + ipc_evt.ipc_shmem_->owners = 1; + ipc_evt.ipc_shmem_->read_index = -1; + ipc_evt.ipc_shmem_->write_index = 0; + for (uint32_t sig_idx = 0; sig_idx < IPC_SIGNALS_PER_EVENT; ++sig_idx) { + ipc_evt.ipc_shmem_->signal[sig_idx] = 0; + } + + close(temp_fd); + return true; +#else + return false; +#endif +} + +hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) { + HIP_INIT_API(hipEventRecord, event, stream); + + STREAM_CAPTURE(hipEventRecord, stream, event); + + if (event == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } + + hip::Event* e = reinterpret_cast<hip::Event*>(event); + + amd::HostQueue* queue = hip::getQueue(stream); + + if (g_devices[e->deviceId()]->devices()[0] != &queue->device()) { + HIP_RETURN(hipErrorInvalidHandle); + } + + bool isRecorded = e->isRecorded(); + if ((e->flags & hipEventInterprocess) && !isRecorded) { + amd::Command* command = new amd::Marker(*queue, kMarkerDisableFlush); + amd::Event& tEvent = command->event(); + createIpcEventShmemIfNeeded(e->ipc_evt_); + int write_index = e->ipc_evt_.ipc_shmem_->write_index++; + int offset = write_index % IPC_SIGNALS_PER_EVENT; + while (e->ipc_evt_.ipc_shmem_->signal[offset] != 0) { + amd::Os::sleep(1); + } + // Lock signal. + e->ipc_evt_.ipc_shmem_->signal[offset] = 1; + e->ipc_evt_.ipc_shmem_->owners_device_id = e->deviceId(); + + std::atomic<int>* signal = &e->ipc_evt_.ipc_shmem_->signal[offset]; + StreamCallback* cbo = new StreamCallback(stream, + reinterpret_cast<hipStreamCallback_t>(ipcEventCallback), signal, command); + if (!tEvent.setCallback(CL_COMPLETE, ihipStreamCallback, cbo)) { + command->release(); + return hipErrorInvalidHandle; + } + command->enqueue(); + tEvent.notifyCmdQueue(); + // Update read index to indicate new signal. 
+ int expected = write_index - 1; + while (!e->ipc_evt_.ipc_shmem_->read_index.compare_exchange_weak(expected, write_index)) { + amd::Os::sleep(1); + } + } else { + e->addMarker(queue, nullptr, true); + } + HIP_RETURN(hipSuccess); +} + +// ================================================================================================ +hipError_t hipEventSynchronize(hipEvent_t event) { + HIP_INIT_API(hipEventSynchronize, event); + + if (event == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } + + hip::Event* e = reinterpret_cast<hip::Event*>(event); + if ((e->flags & hipEventInterprocess) && (e->ipc_evt_.ipc_shmem_)) { + int prev_read_idx = e->ipc_evt_.ipc_shmem_->read_index; + if (prev_read_idx >= 0) { + int offset = (prev_read_idx % IPC_SIGNALS_PER_EVENT); + while ((e->ipc_evt_.ipc_shmem_->read_index < prev_read_idx + IPC_SIGNALS_PER_EVENT) + && (e->ipc_evt_.ipc_shmem_->signal[offset] != 0)) { + amd::Os::sleep(1); + } + } + HIP_RETURN(hipSuccess); + } else { + HIP_RETURN(e->synchronize()); + } +} + +hipError_t hipEventQuery(hipEvent_t event) { + HIP_INIT_API(hipEventQuery, event); + HIP_RETURN(ihipEventQuery(event)); +} + +hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event) { + HIP_INIT_API(hipIpcGetEventHandle, handle, event); +#if !defined(_MSC_VER) + if (handle == nullptr || event == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + hip::Event* e = reinterpret_cast<hip::Event*>(event); + if (!(e->flags & hipEventInterprocess)) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + if (!createIpcEventShmemIfNeeded(e->ipc_evt_)) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + ihipIpcEventHandle_t* iHandle = reinterpret_cast<ihipIpcEventHandle_t*>(handle); + memset(iHandle->shmem_name, 0, HIP_IPC_HANDLE_SIZE); + e->ipc_evt_.ipc_name_.copy(iHandle->shmem_name, std::string::npos); + HIP_RETURN(hipSuccess); +#else + assert(0 && "Unimplemented"); + HIP_RETURN(hipErrorNotSupported); +#endif +} + +hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle) { + HIP_INIT_API(NONE, event, handle); +#if !defined(_MSC_VER) + hipError_t hip_err = hipSuccess; + if (event == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + hip_err = ihipEventCreateWithFlags(event, hipEventDisableTiming | hipEventInterprocess); + if (hip_err != hipSuccess) { + HIP_RETURN(hip_err); + } + hip::Event* e = reinterpret_cast<hip::Event*>(*event); + ihipIpcEventHandle_t* iHandle = reinterpret_cast<ihipIpcEventHandle_t*>(&handle); + hip::Event::ihipIpcEvent_t& ipc_evt = e->ipc_evt_; + ipc_evt.ipc_name_ = iHandle->shmem_name; + if (!amd::Os::MemoryMapFileTruncated(ipc_evt.ipc_name_.c_str(), + (const void**) &(ipc_evt.ipc_shmem_), sizeof(hip::ihipIpcEventShmem_t))) { + HIP_RETURN(hipErrorInvalidValue); + } + + ipc_evt.ipc_shmem_->owners += 1; + e->setDeviceId(ipc_evt.ipc_shmem_->owners_device_id.load()); + + HIP_RETURN(hipSuccess); +#else + assert(0 && "Unimplemented"); + HIP_RETURN(hipErrorNotSupported); +#endif +} diff --git a/rocclr/hip_event.hpp b/rocclr/hip_event.hpp new file mode 100644 index 0000000000..94b71c3fcc --- /dev/null +++ b/rocclr/hip_event.hpp @@ -0,0 +1,151 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef HIP_EVENT_H +#define HIP_EVENT_H + +#include "hip_internal.hpp" +#include "thread/monitor.hpp" + +// Internal structure for stream callback handler +class StreamCallback { + public: + StreamCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData, + amd::Command* command) + : stream_(stream), callBack_(callback), + userData_(userData), command_(command) { + }; + hipStream_t stream_; + hipStreamCallback_t callBack_; + void* userData_; + amd::Command* command_; +}; + +void CL_CALLBACK ihipStreamCallback(cl_event event, cl_int command_exec_status, void* user_data); + + +namespace hip { + +class ProfileMarker: public amd::Marker { +public: + ProfileMarker(amd::HostQueue& queue, bool disableFlush, bool markerTs = false) + : amd::Marker(queue, disableFlush) { + profilingInfo_.enabled_ = true; + profilingInfo_.callback_ = nullptr; + profilingInfo_.marker_ts_ = markerTs; + profilingInfo_.clear(); + } +}; + +#define IPC_SIGNALS_PER_EVENT 32 +typedef struct ihipIpcEventShmem_s { + std::atomic<int> owners; + std::atomic<int> owners_device_id; + std::atomic<int> read_index; + std::atomic<int> write_index; + std::atomic<int> signal[IPC_SIGNALS_PER_EVENT]; +} ihipIpcEventShmem_t; + +class Event { + /// event recorded on stream where capture is active + bool onCapture_; + /// capture stream where event is recorded + hipStream_t captureStream_; + /// Previous captured nodes before event record + std::vector<hipGraphNode_t> nodesPrevToRecorded_; + + public: + Event(unsigned int flags) : flags(flags), lock_("hipEvent_t", true), + event_(nullptr), recorded_(false) { + // No need to init event_ here as addMarker does that + onCapture_ = false; + device_id_ = hip::getCurrentDevice()->deviceId(); // Created in current device ctx + } + + ~Event() { + if (event_ != nullptr) { + event_->release(); + } + } + unsigned int flags; + + hipError_t query(); + hipError_t synchronize(); + hipError_t elapsedTime(Event& stop, float& ms); + hipError_t streamWait(amd::HostQueue* queue, uint flags); + + void addMarker(amd::HostQueue* queue, amd::Command* command, bool record); + bool isRecorded() { return recorded_; } + amd::Monitor& lock() { return lock_; } + const int deviceId() { return device_id_; } + void setDeviceId(int id) { device_id_ = id; } + + /// End capture on this event + void EndCapture() { + onCapture_ = false; + captureStream_ = nullptr; + } + /// Start capture when waited on this event + void StartCapture(hipStream_t stream) { + onCapture_ = true; + captureStream_ = stream; + } + /// 
Get capture status of the graph + bool GetCaptureStatus() { return onCapture_; } + /// Get capture stream where event is recorded + hipStream_t GetCaptureStream() { return captureStream_; } + /// Set capture stream where event is recorded + void SetCaptureStream(hipStream_t stream) { captureStream_ = stream; } + /// Returns previous captured nodes before event record + std::vector<hipGraphNode_t> GetNodesPrevToRecorded() const { return nodesPrevToRecorded_; } + /// Set last captured graph node before event record + void SetNodesPrevToRecorded(std::vector<hipGraphNode_t>& graphNode) { + nodesPrevToRecorded_ = graphNode; + } + + // IPC Events + struct ihipIpcEvent_t { + std::string ipc_name_; + int ipc_fd_; + ihipIpcEventShmem_t* ipc_shmem_; + ihipIpcEvent_t(): ipc_name_("dummy"), ipc_fd_(0), ipc_shmem_(nullptr) { + } + void setipcname(const char* name) { + ipc_name_ = std::string(name); + } + }; + ihipIpcEvent_t ipc_evt_; +private: + amd::Monitor lock_; + amd::HostQueue* stream_; + amd::Event* event_; + int device_id_; + //! Flag to indicate hipEventRecord has been called. This is needed except for + //! hip*ModuleLaunchKernel API which takes start and stop events so no + //! hipEventRecord is called. Cleanup needed once those APIs are deprecated. + bool recorded_; + + bool ready(); + int64_t time() const; +}; + +}; + +#endif // HIP_EVENT_H diff --git a/rocclr/hip_fatbin.cpp b/rocclr/hip_fatbin.cpp new file mode 100755 index 0000000000..04ae382bdf --- /dev/null +++ b/rocclr/hip_fatbin.cpp @@ -0,0 +1,158 @@ +#include "hip_fatbin.hpp" + +#include "hip_code_object.hpp" + +namespace hip { + +FatBinaryDeviceInfo::~FatBinaryDeviceInfo() { + if (program_ != nullptr) { + program_->release(); + program_ = nullptr; + } +} + +FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image) + : fdesc_(amd::Os::FDescInit()), fsize_(0), image_(image), uri_(std::string()) { + + if (fname != nullptr) { + fname_ = std::string(fname); + } else { + fname_ = std::string(); + } + + fatbin_dev_info_.resize(g_devices.size()); +} + +FatBinaryInfo::~FatBinaryInfo() { + + for (auto& fbd: fatbin_dev_info_) { + delete fbd; + } + + if (fdesc_ > 0) { + if (fsize_ && !amd::Os::MemoryUnmapFile(image_, fsize_)) { + guarantee(false, "Cannot unmap file"); + } + if (!amd::Os::CloseFileHandle(fdesc_)) { + guarantee(false, "Cannot close file"); + } + } + + fname_ = std::string(); + fdesc_ = amd::Os::FDescInit(); + fsize_ = 0; + image_ = nullptr; + uri_ = std::string(); +} + +hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector<hip::Device*>& devices) { + hipError_t hip_error = hipSuccess; + std::vector<std::pair<const void*, size_t>> code_objs; + + // Copy device names for Extract Code object File + std::vector<std::string> device_names; + device_names.reserve(devices.size()); + for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { + device_names.push_back(devices[dev_idx]->devices()[0]->isa().isaName()); + } + + // We are given file name, get the file desc and file size + if (fname_.size() > 0) { + // Get File Handle & size of the file. 
+ if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) { + return hipErrorFileNotFound; + } + if (fsize_ == 0) { + return hipErrorInvalidKernelFile; + } + + // Extract the code object from file + hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_, &image_, + device_names, code_objs); + + } else if (image_ != nullptr) { + // We are given the image pointer directly, try to extract file desc & file size + hip_error = CodeObject::ExtractCodeObjectFromMemory(image_, + device_names, code_objs, uri_); + } else { + return hipErrorInvalidValue; + } + + if (hip_error == hipErrorNoBinaryForGpu) { + guarantee(false, "hipErrorNoBinaryForGpu: Couldn't find binary for current devices!"); + return hip_error; + } + + if (hip_error == hipErrorInvalidKernelFile) { + for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { + // the image type is not CLANG_OFFLOAD_BUNDLER, image for current device directly passed + fatbin_dev_info_[devices[dev_idx]->deviceId()] + = new FatBinaryDeviceInfo(image_, CodeObject::ElfSize(image_), 0); + } + } else if (hip_error == hipSuccess) { + for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { + // Calculate the offset wrt binary_image and the original image + size_t offset_l + = (reinterpret_cast<address>(const_cast<void*>(code_objs[dev_idx].first)) + - reinterpret_cast<address>(const_cast<void*>(image_))); + + fatbin_dev_info_[devices[dev_idx]->deviceId()] + = new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l); + } + } + + for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { + fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ + = new amd::Program(*devices[dev_idx]->asContext()); + if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) { + return hipErrorOutOfMemory; + } + } + + return hipSuccess; +} + +hipError_t FatBinaryInfo::AddDevProgram(const int device_id) { + // Device Id bounds Check + DeviceIdCheck(device_id); + + FatBinaryDeviceInfo* fbd_info = fatbin_dev_info_[device_id]; + // If fat binary was already added, skip this step and return success + if (fbd_info->add_dev_prog_ == false) { + amd::Context* ctx = g_devices[device_id]->asContext(); + if (CL_SUCCESS != fbd_info->program_->addDeviceProgram(*ctx->devices()[0], + fbd_info->binary_image_, + fbd_info->binary_size_, false, + nullptr, nullptr, fdesc_, + fbd_info->binary_offset_, uri_)) { + return hipErrorInvalidKernelFile; + } + fbd_info->add_dev_prog_ = true; + } + return hipSuccess; +} + +hipError_t FatBinaryInfo::BuildProgram(const int device_id) { + + // Device Id Check and Add DeviceProgram if not added so far + DeviceIdCheck(device_id); + IHIP_RETURN_ONFAIL(AddDevProgram(device_id)); + + // If Program was already built skip this step and return success + FatBinaryDeviceInfo* fbd_info = fatbin_dev_info_[device_id]; + if (fbd_info->prog_built_ == false) { + if (CL_SUCCESS != fbd_info->program_->build(g_devices[device_id]->devices(), + nullptr, nullptr, nullptr, + kOptionChangeable, kNewDevProg)) { + return hipErrorSharedObjectInitFailed; + } + fbd_info->prog_built_ = true; + } + + if (!fbd_info->program_->load()) { + return hipErrorSharedObjectInitFailed; + } + return hipSuccess; +} + +} //namespace : hip diff --git a/rocclr/hip_fatbin.hpp b/rocclr/hip_fatbin.hpp new file mode 100755 index 0000000000..219a96e802 --- /dev/null +++ b/rocclr/hip_fatbin.hpp @@ -0,0 +1,87 @@ +#ifndef HIP_FAT_BINARY_HPP +#define HIP_FAT_BINARY_HPP + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#include "hip_internal.hpp" +#include "platform/program.hpp" + +namespace hip { + +//Fat Binary Per Device info +class FatBinaryDeviceInfo { +public: + FatBinaryDeviceInfo (const void* binary_image, size_t binary_size, size_t binary_offset) + : binary_image_(binary_image), binary_size_(binary_size), + binary_offset_(binary_offset), program_(nullptr), + add_dev_prog_(false), prog_built_(false) {} + + ~FatBinaryDeviceInfo(); + +private: + const void* binary_image_; // binary image ptr + size_t binary_size_; // binary image size + size_t binary_offset_; // image offset from original + + amd::Program* program_; // reinterpreted as hipModule_t + friend class FatBinaryInfo; + + //Control Variables + bool add_dev_prog_; + bool prog_built_; +}; + + +// Fat Binary Info +class FatBinaryInfo { +public: + FatBinaryInfo(const char* fname, const void* image); + ~FatBinaryInfo(); + + // Loads Fat binary from file or image, unbundles COs for devices. 
+ hipError_t ExtractFatBinary(const std::vector<hip::Device*>& devices); + hipError_t AddDevProgram(const int device_id); + hipError_t BuildProgram(const int device_id); + + + // Device Id bounds check + inline void DeviceIdCheck(const int device_id) const { + guarantee(device_id >= 0, "Invalid DeviceId less than 0"); + guarantee(static_cast<size_t>(device_id) < fatbin_dev_info_.size(), "Invalid DeviceId, greater than no of fatbin device info!"); + } + + // Getter Methods + amd::Program* GetProgram(int device_id) { + DeviceIdCheck(device_id); + return fatbin_dev_info_[device_id]->program_; + } + + hipModule_t Module(int device_id) const { + DeviceIdCheck(device_id); + return reinterpret_cast<hipModule_t>(as_cl(fatbin_dev_info_[device_id]->program_)); + } + + hipError_t GetModule(int device_id, hipModule_t* hmod) const { + DeviceIdCheck(device_id); + *hmod = reinterpret_cast<hipModule_t>(as_cl(fatbin_dev_info_[device_id]->program_)); + return hipSuccess; + } + +private: + std::string fname_; // File name + amd::Os::FileDesc fdesc_; // File descriptor + size_t fsize_; // Total file size + + // Even when a file is passed, the image stays mmapped until the destructor runs. + const void* image_; // Image + + // Only used for FBs where image is directly passed + std::string uri_; // Uniform resource indicator + + // Per Device Info, like corresponding binary ptr, size. + std::vector<FatBinaryDeviceInfo*> fatbin_dev_info_; +}; + +}; /* namespace hip */ + +#endif /* HIP_FAT_BINARY_HPP */ diff --git a/rocclr/hip_formatting.hpp b/rocclr/hip_formatting.hpp new file mode 100644 index 0000000000..83f1467a7b --- /dev/null +++ b/rocclr/hip_formatting.hpp @@ -0,0 +1,853 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ +#include <ostream> +#include <string> + +inline std::ostream& operator<<(std::ostream& os, const hipTextureFilterMode& s) { + switch (s) { + case hipFilterModePoint: + os << "hipFilterModePoint"; + break; + case hipFilterModeLinear: + os << "hipFilterModeLinear"; + break; + default: + os << "hipFilterModePoint"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipTextureReadMode& s) { + switch (s) { + case hipReadModeElementType: + os << "hipReadModeElementType"; + break; + case hipReadModeNormalizedFloat: + os << "hipReadModeNormalizedFloat"; + break; + default: + os << "hipReadModeElementType"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipTextureAddressMode& s) { + switch (s) { + case hipAddressModeWrap: + os << "hipAddressModeWrap"; + break; + case hipAddressModeClamp: + os << "hipAddressModeClamp"; + break; + case hipAddressModeMirror: + os << "hipAddressModeMirror"; + break; + case hipAddressModeBorder: + os << "hipAddressModeBorder"; + break; + default: + os << "hipAddressModeWrap"; + }; + return os; +} + + +inline std::ostream& operator<<(std::ostream& os, const hipMemcpyKind& s) { + switch (s) { + case hipMemcpyHostToHost: + os << "hipMemcpyHostToHost"; + break; + case hipMemcpyHostToDevice: + os << "hipMemcpyHostToDevice"; + break; + case hipMemcpyDeviceToHost: + os << "hipMemcpyDeviceToHost"; + break; + case hipMemcpyDeviceToDevice: + os << "hipMemcpyDeviceToDevice"; + break; + case hipMemcpyDefault: + os << "hipMemcpyDefault"; + break; + default: + os << "hipMemcpyDefault"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatKind& s) { + switch (s) { + case hipChannelFormatKindSigned: + os << "hipChannelFormatKindSigned"; + break; + case hipChannelFormatKindUnsigned: + os << "hipChannelFormatKindUnsigned"; + break; + case hipChannelFormatKindFloat: + os << "hipChannelFormatKindFloat"; + break; + case hipChannelFormatKindNone: + os << "hipChannelFormatKindNone"; + break; + default: + os << "hipChannelFormatKindNone"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipArray_Format& s) { + switch (s) { + case HIP_AD_FORMAT_UNSIGNED_INT8: + os << "HIP_AD_FORMAT_UNSIGNED_INT8"; + break; + case HIP_AD_FORMAT_UNSIGNED_INT16: + os << "HIP_AD_FORMAT_UNSIGNED_INT16"; + break; + case HIP_AD_FORMAT_UNSIGNED_INT32: + os << "HIP_AD_FORMAT_UNSIGNED_INT32"; + break; + case HIP_AD_FORMAT_SIGNED_INT8: + os << "HIP_AD_FORMAT_SIGNED_INT8"; + break; + case HIP_AD_FORMAT_SIGNED_INT16: + os << "HIP_AD_FORMAT_SIGNED_INT16"; + break; + case HIP_AD_FORMAT_SIGNED_INT32: + os << "HIP_AD_FORMAT_SIGNED_INT32"; + break; + case HIP_AD_FORMAT_HALF: + os << "HIP_AD_FORMAT_HALF"; + break; + case HIP_AD_FORMAT_FLOAT: + os << "HIP_AD_FORMAT_FLOAT"; + break; + default: + os << "HIP_AD_FORMAT_FLOAT"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipResourceViewFormat& s) { + switch (s) { + case hipResViewFormatNone: + os << "hipResViewFormatNone"; + break; + case hipResViewFormatUnsignedChar1: + os << "hipResViewFormatUnsignedChar1"; + break; + case hipResViewFormatUnsignedChar2: + os << "hipResViewFormatUnsignedChar2"; + break; + case hipResViewFormatUnsignedChar4: + os << "hipResViewFormatUnsignedChar4"; + break; + case hipResViewFormatSignedChar1: + os << "hipResViewFormatSignedChar1"; + break; + case hipResViewFormatSignedChar2: + os << "hipResViewFormatSignedChar2"; + break; + case hipResViewFormatSignedChar4: + os << "hipResViewFormatSignedChar4"; + break; 
+ case hipResViewFormatUnsignedShort1: + os << "hipResViewFormatUnsignedShort1"; + break; + case hipResViewFormatUnsignedShort2: + os << "hipResViewFormatUnsignedShort2"; + break; + case hipResViewFormatUnsignedShort4: + os << "hipResViewFormatUnsignedShort4"; + break; + case hipResViewFormatSignedShort1: + os << "hipResViewFormatSignedShort1"; + break; + case hipResViewFormatSignedShort2: + os << "hipResViewFormatSignedShort2"; + break; + case hipResViewFormatSignedShort4: + os << "hipResViewFormatSignedShort4"; + break; + case hipResViewFormatUnsignedInt1: + os << "hipResViewFormatUnsignedInt1"; + break; + case hipResViewFormatUnsignedInt2: + os << "hipResViewFormatUnsignedInt2"; + break; + case hipResViewFormatUnsignedInt4: + os << "hipResViewFormatUnsignedInt4"; + break; + case hipResViewFormatSignedInt1: + os << "hipResViewFormatSignedInt1"; + break; + case hipResViewFormatSignedInt2: + os << "hipResViewFormatSignedInt2"; + break; + case hipResViewFormatSignedInt4: + os << "hipResViewFormatSignedInt4"; + break; + case hipResViewFormatHalf1: + os << "hipResViewFormatHalf1"; + break; + case hipResViewFormatHalf2: + os << "hipResViewFormatHalf2"; + break; + case hipResViewFormatHalf4: + os << "hipResViewFormatHalf4"; + break; + case hipResViewFormatFloat1: + os << "hipResViewFormatFloat1"; + break; + case hipResViewFormatFloat2: + os << "hipResViewFormatFloat2"; + break; + case hipResViewFormatFloat4: + os << "hipResViewFormatFloat4"; + break; + case hipResViewFormatUnsignedBlockCompressed1: + os << "hipResViewFormatUnsignedBlockCompressed1"; + break; + case hipResViewFormatUnsignedBlockCompressed2: + os << "hipResViewFormatUnsignedBlockCompressed2"; + break; + case hipResViewFormatUnsignedBlockCompressed3: + os << "hipResViewFormatUnsignedBlockCompressed3"; + break; + case hipResViewFormatUnsignedBlockCompressed4: + os << "hipResViewFormatUnsignedBlockCompressed4"; + break; + case hipResViewFormatSignedBlockCompressed4: + os << "hipResViewFormatSignedBlockCompressed4"; + break; + case hipResViewFormatUnsignedBlockCompressed5: + os << "hipResViewFormatUnsignedBlockCompressed5"; + break; + case hipResViewFormatSignedBlockCompressed5: + os << "hipResViewFormatSignedBlockCompressed5"; + break; + case hipResViewFormatUnsignedBlockCompressed6H: + os << "hipResViewFormatUnsignedBlockCompressed6H"; + break; + case hipResViewFormatSignedBlockCompressed6H: + os << "hipResViewFormatSignedBlockCompressed6H"; + break; + case hipResViewFormatUnsignedBlockCompressed7: + os << "hipResViewFormatUnsignedBlockCompressed7"; + break; + default: + os << "hipResViewFormatNone"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipFunction_attribute& s) { + switch (s) { + case HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: + os << "HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK"; + break; + case HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: + os << "HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES"; + break; + case HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: + os << "HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES"; + break; + case HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: + os << "HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES"; + break; + case HIP_FUNC_ATTRIBUTE_NUM_REGS: + os << "HIP_FUNC_ATTRIBUTE_NUM_REGS"; + break; + case HIP_FUNC_ATTRIBUTE_PTX_VERSION: + os << "HIP_FUNC_ATTRIBUTE_PTX_VERSION"; + break; + case HIP_FUNC_ATTRIBUTE_BINARY_VERSION: + os << "HIP_FUNC_ATTRIBUTE_BINARY_VERSION"; + break; + case HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA: + os << "HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA"; + break; + case 
HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: + os << "HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES"; + break; + case HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: + os << "HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT"; + break; + case HIP_FUNC_ATTRIBUTE_MAX: + os << "HIP_FUNC_ATTRIBUTE_MAX"; + break; + default: + os << "HIP_FUNC_ATTRIBUTE_MAX"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hiprtcResult& s) { + switch (s) { + case HIPRTC_SUCCESS: + os << "HIPRTC_SUCCESS"; + break; + case HIPRTC_ERROR_OUT_OF_MEMORY: + os << "HIPRTC_ERROR_OUT_OF_MEMORY"; + break; + case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE: + os << "HIPRTC_ERROR_PROGRAM_CREATION_FAILURE"; + break; + case HIPRTC_ERROR_INVALID_INPUT: + os << "HIPRTC_ERROR_INVALID_INPUT"; + break; + case HIPRTC_ERROR_INVALID_PROGRAM: + os << "HIPRTC_ERROR_INVALID_PROGRAM"; + break; + case HIPRTC_ERROR_INVALID_OPTION: + os << "HIPRTC_ERROR_INVALID_OPTION"; + break; + case HIPRTC_ERROR_COMPILATION: + os << "HIPRTC_ERROR_COMPILATION"; + break; + case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE: + os << "HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE"; + break; + case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION: + os << "HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION"; + break; + case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION: + os << "HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION"; + break; + case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID: + os << "HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID"; + break; + case HIPRTC_ERROR_INTERNAL_ERROR: + os << "HIPRTC_ERROR_INTERNAL_ERROR"; + break; + default: + os << "HIPRTC_ERROR_INTERNAL_ERROR"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipJitOption& s) { + switch (s) { + case hipJitOptionMaxRegisters: + os << "hipJitOptionMaxRegisters"; + break; + case hipJitOptionThreadsPerBlock: + os << "hipJitOptionThreadsPerBlock"; + break; + case hipJitOptionWallTime: + os << "hipJitOptionWallTime"; + break; + case hipJitOptionInfoLogBuffer: + os << "hipJitOptionInfoLogBuffer"; + break; + case hipJitOptionInfoLogBufferSizeBytes: + os << "hipJitOptionInfoLogBufferSizeBytes"; + break; + case hipJitOptionErrorLogBuffer: + os << "hipJitOptionErrorLogBuffer"; + break; + case hipJitOptionErrorLogBufferSizeBytes: + os << "hipJitOptionErrorLogBufferSizeBytes"; + break; + case hipJitOptionOptimizationLevel: + os << "hipJitOptionOptimizationLevel"; + break; + case hipJitOptionTargetFromContext: + os << "hipJitOptionTargetFromContext"; + break; + case hipJitOptionTarget: + os << "hipJitOptionTarget"; + break; + case hipJitOptionFallbackStrategy: + os << "hipJitOptionFallbackStrategy"; + break; + case hipJitOptionGenerateDebugInfo: + os << "hipJitOptionGenerateDebugInfo"; + break; + case hipJitOptionCacheMode: + os << "hipJitOptionCacheMode"; + break; + case hipJitOptionSm3xOpt: + os << "hipJitOptionSm3xOpt"; + break; + case hipJitOptionFastCompile: + os << "hipJitOptionFastCompile"; + break; + case hipJitOptionNumOptions: + os << "hipJitOptionNumOptions"; + break; + default: + os << "hipJitOptionMaxRegisters"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipFuncCache_t& s) { + switch (s) { + case hipFuncCachePreferNone: + os << "hipFuncCachePreferNone"; + break; + case hipFuncCachePreferShared: + os << "hipFuncCachePreferShared"; + break; + case hipFuncCachePreferL1: + os << "hipFuncCachePreferL1"; + break; + case hipFuncCachePreferEqual: + os << "hipFuncCachePreferEqual"; + break; + default: + os << 
"hipFuncCachePreferNone"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipSharedMemConfig& s) { + switch (s) { + case hipSharedMemBankSizeDefault: + os << "hipSharedMemBankSizeDefault"; + break; + case hipSharedMemBankSizeFourByte: + os << "hipSharedMemBankSizeFourByte"; + break; + case hipSharedMemBankSizeEightByte: + os << "hipSharedMemBankSizeEightByte"; + break; + default: + os << "hipSharedMemBankSizeDefault"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipDataType& s) { + switch (s) { + case HIP_R_16F: + os << "HIP_R_16F"; + break; + case HIP_R_32F: + os << "HIP_R_32F"; + break; + case HIP_R_64F: + os << "HIP_R_64F"; + break; + case HIP_C_16F: + os << "HIP_C_16F"; + break; + case HIP_C_32F: + os << "HIP_C_32F"; + break; + case HIP_C_64F: + os << "HIP_C_64F"; + break; + default: + os << "HIP_R_16F"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipLibraryPropertyType& s) { + switch (s) { + case HIP_LIBRARY_MAJOR_VERSION: + os << "HIP_LIBRARY_MAJOR_VERSION"; + break; + case HIP_LIBRARY_MINOR_VERSION: + os << "HIP_LIBRARY_MINOR_VERSION"; + break; + case HIP_LIBRARY_PATCH_LEVEL: + os << "HIP_LIBRARY_PATCH_LEVEL"; + break; + default: + os << "HIP_LIBRARY_MAJOR_VERSION"; + }; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hip_api_id_t& s) { + os << hip_api_name(s); + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hip_api_id_t* s) { + if (s) { + os << *s; + } else { + os << "nullptr"; + } + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipTextureDesc& s) { + os << '{' + << '{' + << s.addressMode[0] + << ',' + << s.addressMode[1] + << ',' + << s.addressMode[2] + << '}' + << ',' + << s.filterMode + << ',' + << s.readMode + << ',' + << s.sRGB + << ',' + << '{' + << s.borderColor[0] + << ',' + << s.borderColor[1] + << ',' + << s.borderColor[2] + << ',' + << s.borderColor[3] + << '}' + << ',' + << s.normalizedCoords + << ',' + << s.mipmapFilterMode + << ',' + << s.mipmapLevelBias + << ',' + << s.minMipmapLevelClamp + << ',' + << s.maxMipmapLevelClamp + << '}'; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipTextureDesc* s) { + if (s) { + os << *s; + } else { + os << "nullptr"; + } + return os; +} + + +inline std::ostream& operator<<(std::ostream& os, const dim3& s) { + os << '{' + << s.x + << ',' + << s.y + << ',' + << s.z + << '}'; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const dim3* s) { + if (s) { + os << *s; + } else { + os << "nullptr"; + } + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatDesc& s) { + os << '{' + << s.x + << ',' + << s.y + << ',' + << s.z + << ',' + << s.w + << ',' + << s.f + << '}'; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatDesc* s) { + if (s) { + os << *s; + } else { + os << "nullptr"; + } + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipMipmappedArray& s) { + os << '{' + << s.data + << ',' + << s.desc + << ',' + << s.width + << ',' + << s.height + << ',' + << s.depth + << '}'; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const hipMipmappedArray* s) { + if (s) { + os << *s; + } else { + os << "nullptr"; + } + return os; +} + + +inline std::ostream& operator<<(std::ostream& os, const hipResourceDesc& s) { + os << '{' + << s.resType + << ',' + << '{'; + + switch (s.resType) { + case 
hipResourceTypeLinear: + os << s.res.linear.devPtr + << ',' + << s.res.linear.desc + << ',' + << s.res.linear.sizeInBytes; + break; + case hipResourceTypePitch2D: + os << s.res.pitch2D.devPtr + << ',' + << s.res.pitch2D.desc + << ',' + << s.res.pitch2D.width + << ',' + << s.res.pitch2D.height + << ',' + << s.res.pitch2D.pitchInBytes; + break; + case hipResourceTypeArray: + os << s.res.array.array; + break; + case hipResourceTypeMipmappedArray: + os << s.res.mipmap.mipmap; + break; + default: + break; + } + os << '}' + << '}'; + return os; +} diff --git a/rocclr/hip_global.cpp b/rocclr/hip_global.cpp new file mode 100755 --- /dev/null +++ b/rocclr/hip_global.cpp @@ -0,0 +1,196 @@ +#include "hip_global.hpp" + +namespace hip { + +//Device Vars +DeviceVar::DeviceVar(std::string name, hipModule_t hmod) : shadowVptr(nullptr), name_(name), + amd_mem_obj_(nullptr), device_ptr_(nullptr), size_(0) { + amd::Program* program = as_amd(reinterpret_cast<cl_program>(hmod)); + device::Program* dev_program = program->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]); + if (dev_program == nullptr) { + LogPrintfError("Cannot get Device Program for module: 0x%x \n", hmod); + guarantee(false, "Cannot get Device Program"); + } + + if (!dev_program->createGlobalVarObj(&amd_mem_obj_, &device_ptr_, &size_, name.c_str())) { + LogPrintfError("Cannot create Global Var obj for symbol: %s \n", name.c_str()); + guarantee(false, "Cannot create GlobalVar Obj"); + } + + // Handle size 0 symbols + if (size_ != 0) { + if (amd_mem_obj_ == nullptr || device_ptr_ == nullptr) { + LogPrintfError("Cannot get memory for creating device Var: %s", name.c_str()); + guarantee(false, "Cannot get memory for creating device var"); + } + amd::MemObjMap::AddMemObj(device_ptr_, amd_mem_obj_); + } +} + +DeviceVar::~DeviceVar() { + if (amd_mem_obj_ != nullptr) { + amd::MemObjMap::RemoveMemObj(device_ptr_); + amd_mem_obj_->release(); + } + + if (shadowVptr != nullptr) { + textureReference* texRef = reinterpret_cast<textureReference*>(shadowVptr); + delete texRef; + shadowVptr = nullptr; + } + + device_ptr_ = nullptr; + size_ = 0; +} + +//Device Functions +DeviceFunc::DeviceFunc(std::string name, hipModule_t hmod) : dflock_("function lock"), + name_(name), kernel_(nullptr) { + amd::Program* program = as_amd(reinterpret_cast<cl_program>(hmod)); + + const amd::Symbol *symbol = program->findSymbol(name.c_str()); + if (symbol == nullptr) { + LogPrintfError("Cannot find Symbol with name: %s \n", name.c_str()); + guarantee(false, "Cannot find Symbol"); + } + + kernel_ = new amd::Kernel(*program, *symbol, name); + if (kernel_ == nullptr) { + LogPrintfError("Cannot create kernel with name: %s \n", name.c_str()); + guarantee(false, "Cannot Create kernel"); + } +} + +DeviceFunc::~DeviceFunc() { + if (kernel_ != nullptr) { + kernel_->release(); + } +} + +//Abstract functions +Function::Function(std::string name, FatBinaryInfo** modules) + : name_(name), modules_(modules) { + dFunc_.resize(g_devices.size()); +} + +Function::~Function() { + for (auto& elem : dFunc_) { + delete elem; + } + name_ = ""; + modules_ = nullptr; +} + +hipError_t Function::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod) { + guarantee((dFunc_.size() == g_devices.size()), "dFunc Size mismatch"); + if (dFunc_[ihipGetDevice()] == nullptr) { + dFunc_[ihipGetDevice()] = new DeviceFunc(name_, hmod); + } + *hfunc = dFunc_[ihipGetDevice()]->asHipFunction(); + + return hipSuccess; +} + +hipError_t Function::getStatFunc(hipFunction_t* hfunc, int deviceId) { + guarantee(modules_ != nullptr, "Module not initialized"); + + hipModule_t hmod = nullptr; + IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId)); + IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &hmod)); + + if (dFunc_[deviceId] == nullptr) { + dFunc_[deviceId] = new DeviceFunc(name_, hmod); + } + *hfunc = dFunc_[deviceId]->asHipFunction(); + + return hipSuccess; +} + +hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId) { + guarantee((modules_ != nullptr), "Module not initialized"); + + hipModule_t hmod = nullptr; + 
IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId)); + IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &hmod)); + + if (dFunc_[deviceId] == nullptr) { + dFunc_[deviceId] = new DeviceFunc(name_, hmod); + } + + const std::vector<amd::Device*>& devices = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false); + + amd::Kernel* kernel = dFunc_[deviceId]->kernel(); + const device::Kernel::WorkGroupInfo* wginfo = kernel->getDeviceKernel(*devices[deviceId])->workGroupInfo(); + func_attr->sharedSizeBytes = static_cast<size_t>(wginfo->localMemSize_); + func_attr->binaryVersion = static_cast<int>(kernel->signature().version()); + func_attr->cacheModeCA = 0; + func_attr->constSizeBytes = 0; + func_attr->localSizeBytes = wginfo->privateMemSize_; + func_attr->maxDynamicSharedSizeBytes = static_cast<int>(wginfo->availableLDSSize_ + - wginfo->localMemSize_); + + func_attr->maxThreadsPerBlock = static_cast<int>(wginfo->size_); + func_attr->numRegs = static_cast<int>(wginfo->usedVGPRs_); + func_attr->preferredShmemCarveout = 0; + func_attr->ptxVersion = 30; + + + return hipSuccess; +} + +//Abstract Vars +Var::Var(std::string name, DeviceVarKind dVarKind, size_t size, int type, int norm, + FatBinaryInfo** modules) : name_(name), dVarKind_(dVarKind), size_(size), + type_(type), norm_(norm), modules_(modules) { + dVar_.resize(g_devices.size()); +} + +Var::Var(std::string name, DeviceVarKind dVarKind, void *pointer, size_t size, + unsigned align, FatBinaryInfo** modules) : name_(name), dVarKind_(dVarKind), + size_(size), modules_(modules), managedVarPtr_(pointer), align_(align) { + dVar_.resize(g_devices.size()); +} + +Var::~Var() { + for (auto& elem : dVar_) { + delete elem; + } + modules_ = nullptr; +} + +hipError_t Var::getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod) { + guarantee((deviceId >= 0), "Invalid DeviceId, less than zero"); + guarantee((static_cast<size_t>(deviceId) < g_devices.size()), + "Invalid DeviceId, greater than no of code objects"); + guarantee((dVar_.size() == g_devices.size()), + "Device Var not initialized to size"); + + if (dVar_[deviceId] == nullptr) { + dVar_[deviceId] = new DeviceVar(name_, hmod); + } + + *dvar = dVar_[deviceId]; + return hipSuccess; +} + +hipError_t Var::getStatDeviceVar(DeviceVar** dvar, int deviceId) { + guarantee((deviceId >= 0), "Invalid DeviceId, less than zero"); + guarantee((static_cast<size_t>(deviceId) < g_devices.size()), + "Invalid DeviceId, greater than no of code objects"); + if (dVar_[deviceId] == nullptr) { + hipModule_t hmod = nullptr; + IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId)); + IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &hmod)); + dVar_[deviceId] = new DeviceVar(name_, hmod); + } + *dvar = dVar_[deviceId]; + return hipSuccess; +} + +}; //namespace: hip diff --git a/rocclr/hip_global.hpp b/rocclr/hip_global.hpp new file mode 100755 index 0000000000..55f0027d58 --- /dev/null +++ b/rocclr/hip_global.hpp @@ -0,0 +1,119 @@ +#ifndef HIP_GLOBAL_HPP +#define HIP_GLOBAL_HPP + +#include <string> +#include <vector> + +#include "hip/hip_runtime_api.h" +#include "hip/hip_runtime.h" +#include "hip_internal.hpp" +#include "hip_fatbin.hpp" +#include "platform/program.hpp" + +namespace hip { + +//Forward Declaration +class CodeObject; + +//Device Structures +class DeviceVar { +public: + DeviceVar(std::string name, hipModule_t hmod); + ~DeviceVar(); + + //Accessors for device ptr and size, populated during constructor. 
+ hipDeviceptr_t device_ptr() const { return device_ptr_; } + size_t size() const { return size_; } + std::string name() const { return name_; } + void* shadowVptr; + +private: + std::string name_; //Name of the var + amd::Memory* amd_mem_obj_; //amd_mem_obj abstraction + hipDeviceptr_t device_ptr_; //Device Pointer + size_t size_; //Size of the var +}; + +class DeviceFunc { +public: + DeviceFunc(std::string name, hipModule_t hmod); + ~DeviceFunc(); + + amd::Monitor dflock_; + + //Converts DeviceFunc to hipFunction_t(used by app) and vice versa. + hipFunction_t asHipFunction() { return reinterpret_cast<hipFunction_t>(this); } + static DeviceFunc* asFunction(hipFunction_t f) { return reinterpret_cast<DeviceFunc*>(f); } + + //Accessor for kernel_ and name_ populated during constructor. + std::string name() const { return name_; } + amd::Kernel* kernel() const { return kernel_; } + +private: + std::string name_; //name of the func(not unique identifier) + amd::Kernel* kernel_; //Kernel ptr referencing to ROCclr Symbol +}; + +//Abstract Structures +class Function { +public: + Function(std::string name, FatBinaryInfo** modules=nullptr); + ~Function(); + + //Return DeviceFunc for this dynamically loaded module + hipError_t getDynFunc(hipFunction_t* hfunc, hipModule_t hmod); + + //Return Device Func & attr. Generate/build if not already done so. + hipError_t getStatFunc(hipFunction_t *hfunc, int deviceId); + hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId); + void resize_dFunc(size_t size) { dFunc_.resize(size); } + FatBinaryInfo** moduleInfo() { return modules_; }; + +private: + std::vector<DeviceFunc*> dFunc_; //DeviceFuncObj per Device + std::string name_; //name of the func(not unique identifier) + FatBinaryInfo** modules_; // static module where it is referenced +}; + +class Var { +public: + //Types of variable + enum DeviceVarKind { + DVK_Variable = 0, + DVK_Surface, + DVK_Texture, + DVK_Managed + }; + + Var(std::string name, DeviceVarKind dVarKind, size_t size, int type, int norm, + FatBinaryInfo** modules = nullptr); + + Var(std::string name, DeviceVarKind dVarKind, void *pointer, size_t size, unsigned align, + FatBinaryInfo** modules = nullptr); + + ~Var(); + + //Return DeviceVar for this dynamically loaded module + hipError_t getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod); + + //Return DeviceVar for module. Generate/build if not already done so. + hipError_t getStatDeviceVar(DeviceVar** dvar, int deviceId); + void resize_dVar(size_t size) { dVar_.resize(size); } + + FatBinaryInfo** moduleInfo() { return modules_; }; + void* getManagedVarPtr() { return managedVarPtr_; }; +private: + std::vector<DeviceVar*> dVar_; // DeviceVarObj per Device + std::string name_; // Variable name (not unique identifier) + DeviceVarKind dVarKind_; // Variable kind + size_t size_; // Size of the variable + int type_; // Type(Textures/Surfaces only) + int norm_; // Type(Textures/Surfaces only) + FatBinaryInfo** modules_; // static module where it is referenced + + void *managedVarPtr_; // Managed memory pointer with size_ & align_ + unsigned int align_; // Managed memory alignment +}; + +}; //namespace: hip +#endif /* HIP_GLOBAL_HPP */ diff --git a/rocclr/hip_graph.cpp b/rocclr/hip_graph.cpp new file mode 100644 index 0000000000..a8f5df5100 --- /dev/null +++ b/rocclr/hip_graph.cpp @@ -0,0 +1,415 @@ +/* Copyright (c) 2021-present Advanced Micro Devices, Inc. 
diff --git a/rocclr/hip_graph.cpp b/rocclr/hip_graph.cpp
new file mode 100644
index 0000000000..a8f5df5100
--- /dev/null
+++ b/rocclr/hip_graph.cpp
@@ -0,0 +1,415 @@
+/* Copyright (c) 2021-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "hip_graph_internal.hpp"
+#include "platform/command.hpp"
+#include "hip_conversions.hpp"
+#include "hip_platform.hpp"
+#include "hip_event.hpp"
+
+thread_local std::vector<hipStream_t> g_captureStreams;
+std::unordered_map<amd::Command*, hipGraphExec_t> hipGraphExec::activeGraphExec_;
+
+hipError_t ihipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                  const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                  const hipKernelNodeParams* pNodeParams) {
+  if (pGraphNode == nullptr || graph == nullptr ||
+      (numDependencies > 0 && pDependencies == nullptr) || pNodeParams == nullptr) {
+    return hipErrorInvalidValue;
+  }
+  hipFunction_t func = nullptr;
+  hipError_t status =
+      PlatformState::instance().getStatFunc(&func, pNodeParams->func, ihipGetDevice());
+  if ((status != hipSuccess) || (func == nullptr)) {
+    return hipErrorInvalidDeviceFunction;
+  }
+  size_t globalWorkSizeX = static_cast<size_t>(pNodeParams->gridDim.x) * pNodeParams->blockDim.x;
+  size_t globalWorkSizeY = static_cast<size_t>(pNodeParams->gridDim.y) * pNodeParams->blockDim.y;
+  size_t globalWorkSizeZ = static_cast<size_t>(pNodeParams->gridDim.z) * pNodeParams->blockDim.z;
+  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+    return hipErrorInvalidConfiguration;
+  }
+  status = ihipLaunchKernel_validate(
+      func, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
+      static_cast<uint32_t>(globalWorkSizeZ), pNodeParams->blockDim.x, pNodeParams->blockDim.y,
+      pNodeParams->blockDim.z, pNodeParams->sharedMemBytes, pNodeParams->kernelParams,
+      pNodeParams->extra, ihipGetDevice(), 0);
+  if (status != hipSuccess) {
+    return status;
+  }
+  *pGraphNode = new hipGraphKernelNode(pNodeParams, func);
+  if (numDependencies == 0) {
+    graph->AddNode(*pGraphNode);
+  }
+  for (size_t i = 0; i < numDependencies; i++) {
+    if (graph->AddEdge(*(pDependencies + i), *pGraphNode) != hipSuccess) {
+      return hipErrorInvalidValue;
+    }
+  }
+  return hipSuccess;
+}
+
+hipError_t ihipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                  const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                  const hipMemcpy3DParms* pCopyParams) {
+  if (pGraphNode == nullptr || graph == nullptr ||
+      (numDependencies > 0 && pDependencies == nullptr) || pCopyParams == nullptr) {
+    return hipErrorInvalidValue;
+  }
+  ihipMemcpy3D_validate(pCopyParams);
+  *pGraphNode = new hipGraphMemcpyNode(pCopyParams);
+  if (numDependencies == 0) {
+    graph->AddNode(*pGraphNode);
+  }
+  for (size_t i = 0; i < numDependencies; i++) {
+    if (graph->AddEdge(*(pDependencies + i), *pGraphNode) != hipSuccess) {
+      return hipErrorInvalidValue;
+    }
+  }
+  return hipSuccess;
+}
+
+hipError_t ihipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                  const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                  const hipMemsetParams* pMemsetParams) {
+  if (pGraphNode == nullptr || graph == nullptr ||
+      (numDependencies > 0 && pDependencies == nullptr) || pMemsetParams == nullptr) {
+    return hipErrorInvalidValue;
+  }
+  if (pMemsetParams->height == 1) {
+    ihipMemset_validate(pMemsetParams->dst, pMemsetParams->value, pMemsetParams->elementSize,
+                        pMemsetParams->width * pMemsetParams->elementSize);
+  } else {
+    auto sizeBytes = pMemsetParams->width * pMemsetParams->height * 1;
+    ihipMemset3D_validate(
+        {pMemsetParams->dst, pMemsetParams->pitch, pMemsetParams->width, pMemsetParams->height},
+        pMemsetParams->value, {pMemsetParams->width, pMemsetParams->height, 1}, sizeBytes);
+  }
+
+  *pGraphNode = new hipGraphMemsetNode(pMemsetParams);
+  if (numDependencies == 0) {
+    graph->AddNode(*pGraphNode);
+  }
+  for (size_t i = 0; i < numDependencies; i++) {
+    if (graph->AddEdge(*(pDependencies + i), *pGraphNode) != hipSuccess) {
+      return hipErrorInvalidValue;
+    }
+  }
+  return hipSuccess;
+}
+
+hipError_t capturehipLaunchKernel(hipStream_t& stream, const void*& hostFunction, dim3& gridDim,
+                                  dim3& blockDim, void**& args, size_t& sharedMemBytes) {
+  ClPrint(amd::LOG_INFO, amd::LOG_API,
+          "[hipGraph] current capture node kernel launch on stream : %p", stream);
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  hipKernelNodeParams nodeParams;
+  nodeParams.func = const_cast<void*>(hostFunction);
+  nodeParams.blockDim = blockDim;
+  nodeParams.extra = nullptr;
+  nodeParams.gridDim = gridDim;
+  nodeParams.kernelParams = args;
+  nodeParams.sharedMemBytes = sharedMemBytes;
+
+  hipGraphNode_t pGraphNode;
+  hipError_t status =
+      ihipGraphAddKernelNode(&pGraphNode, s->GetCaptureGraph(), s->GetLastCapturedNodes().data(),
+                             s->GetLastCapturedNodes().size(), &nodeParams);
+  if (status != hipSuccess) {
+    return status;
+  }
+  s->SetLastCapturedNode(pGraphNode);
+  return hipSuccess;
+}
+
+hipError_t capturehipMemcpy3DAsync(hipStream_t& stream, const hipMemcpy3DParms*& p) {
+  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] current capture node Memcpy3D on stream : %p",
+          stream);
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  hipGraphNode_t pGraphNode;
+  hipError_t status =
+      ihipGraphAddMemcpyNode(&pGraphNode, s->GetCaptureGraph(), s->GetLastCapturedNodes().data(),
+                             s->GetLastCapturedNodes().size(), p);
+  if (status != hipSuccess) {
+    return status;
+  }
+  s->SetLastCapturedNode(pGraphNode);
+  return hipSuccess;
+}
+
+hipError_t capturehipMemcpyAsync(hipStream_t& stream, void*& dst, const void*& src,
+                                 size_t& sizeBytes, hipMemcpyKind& kind) {
+  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] current capture node Memcpy1D on stream : %p",
+          stream);
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  hipGraphNode_t pGraphNode;
+  hipGraph_t graph = nullptr;
+  std::vector<hipGraphNode_t> pDependencies = s->GetLastCapturedNodes();
+  size_t numDependencies = s->GetLastCapturedNodes().size();
+  graph = s->GetCaptureGraph();
+  ihipMemcpy_validate(dst, src, sizeBytes, kind);
+  pGraphNode = new hipGraphMemcpyNode1D(dst, src, sizeBytes, kind);
+  if (numDependencies == 0) {
+    graph->AddNode(pGraphNode);
+  }
+  for (size_t i = 0; i < numDependencies; i++) {
+    if (graph->AddEdge(pDependencies[i], pGraphNode) != hipSuccess) {
+      return hipErrorInvalidValue;
+    }
+  }
+  s->SetLastCapturedNode(pGraphNode);
+  return hipSuccess;
+}
+
+hipError_t capturehipMemcpyFromSymbolAsync(hipStream_t& stream, void*& dst, const void*& symbol,
+                                           size_t& sizeBytes, size_t& offset, hipMemcpyKind& kind) {
+  ClPrint(amd::LOG_INFO, amd::LOG_API,
+          "[hipGraph] current capture node MemcpyFromSymbolNode on stream : %p", stream);
+  return hipSuccess;
+}
+
+hipError_t capturehipMemcpyToSymbolAsync(hipStream_t& stream, const void*& symbol, const void*& src,
+                                         size_t& sizeBytes, size_t& offset, hipMemcpyKind& kind) {
+  ClPrint(amd::LOG_INFO, amd::LOG_API,
+          "[hipGraph] current capture node MemcpyToSymbolNode on stream : %p", stream);
+  return hipSuccess;
+}
+
+hipError_t capturehipMemsetAsync(hipStream_t& stream, void*& dst, int& value, size_t& valueSize,
+                                 size_t& sizeBytes) {
+  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] current capture node Memset1D on stream : %p",
+          stream);
+
+  hipMemsetParams memsetParams = {0};
+  memsetParams.dst = dst;
+  memsetParams.value = value;
+  memsetParams.elementSize = valueSize;
+  memsetParams.width = sizeBytes / valueSize;
+  memsetParams.height = 1;
+
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  hipGraphNode_t pGraphNode;
+  hipError_t status =
+      ihipGraphAddMemsetNode(&pGraphNode, s->GetCaptureGraph(), s->GetLastCapturedNodes().data(),
+                             s->GetLastCapturedNodes().size(), &memsetParams);
+  if (status != hipSuccess) {
+    return status;
+  }
+  s->SetLastCapturedNode(pGraphNode);
+  return hipSuccess;
+}
+
+hipError_t capturehipMemset2DAsync(hipStream_t& stream, void*& dst, size_t& pitch, int& value,
+                                   size_t& width, size_t& height) {
+  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] current capture node Memset2D on stream : %p",
+          stream);
+  hipMemsetParams memsetParams = {0};
+
+  memsetParams.dst = dst;
+  memsetParams.value = value;
+  memsetParams.width = width;
+  memsetParams.height = height;
+  memsetParams.pitch = pitch;
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  hipGraphNode_t pGraphNode;
+  hipError_t status =
+      ihipGraphAddMemsetNode(&pGraphNode, s->GetCaptureGraph(), s->GetLastCapturedNodes().data(),
+                             s->GetLastCapturedNodes().size(), &memsetParams);
+  if (status != hipSuccess) {
+    return status;
+  }
+  s->SetLastCapturedNode(pGraphNode);
+  return hipSuccess;
+}
+
+hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDevPtr, int& value,
+                                   hipExtent& extent) {
+  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] current capture node Memset3D on stream : %p",
+          stream);
+  return hipSuccess;
+}
+
+hipError_t capturehipEventRecord(hipStream_t& stream, hipEvent_t& event) {
+  ClPrint(amd::LOG_INFO, amd::LOG_API,
+          "[hipGraph] current capture node EventRecord on stream : %p, Event %p", stream, event);
+  if (event == nullptr) {
+    HIP_RETURN(hipErrorInvalidHandle);
+  }
+  hip::Event* e = reinterpret_cast<hip::Event*>(event);
+  e->StartCapture(stream);
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  std::vector<hipGraphNode_t> lastCapturedNodes = s->GetLastCapturedNodes();
+  if (!lastCapturedNodes.empty()) {
+    e->SetNodesPrevToRecorded(lastCapturedNodes);
+  }
+  return hipSuccess;
+}
+
+hipError_t capturehipStreamWaitEvent(hipEvent_t& event, hipStream_t& stream, unsigned int& flags) {
+  ClPrint(amd::LOG_INFO, amd::LOG_API,
+          "[hipGraph] current capture node StreamWaitEvent on stream : %p, Event %p", stream,
+          event);
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  hip::Event* e = reinterpret_cast<hip::Event*>(event);
+
+  if (event == nullptr || stream == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  if (!s->IsOriginStream()) {
+    s->SetCaptureGraph(reinterpret_cast<hip::Stream*>(e->GetCaptureStream())->GetCaptureGraph());
+    s->SetCaptureMode(reinterpret_cast<hip::Stream*>(e->GetCaptureStream())->GetCaptureMode());
+    s->SetParentStream(e->GetCaptureStream());
+  }
+  s->AddCrossCapturedNode(e->GetNodesPrevToRecorded());
+  g_captureStreams.push_back(stream);
+  return hipSuccess;
+}
+
+hipError_t hipStreamIsCapturing(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus) {
+  HIP_INIT_API(hipStreamIsCapturing, stream, pCaptureStatus);
+  if (stream == nullptr || pCaptureStatus == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  *pCaptureStatus = reinterpret_cast<hip::Stream*>(stream)->GetCaptureStatus();
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode) {
+  HIP_INIT_API(hipStreamBeginCapture, stream, mode);
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  // Capture cannot be initiated on the legacy stream.
+  // It can be initiated only if the stream is not already in capture mode.
+  if (stream == nullptr || s->GetCaptureStatus() == hipStreamCaptureStatusActive) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  s->SetCaptureGraph(new hipGraph());
+  s->SetCaptureMode(mode);
+  s->SetOriginStream();
+  g_captureStreams.push_back(stream);
+  HIP_RETURN_DURATION(hipSuccess);
+}
+
+hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph) {
+  HIP_INIT_API(hipStreamEndCapture, stream, pGraph);
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  // Capture must be ended on the same stream on which it was initiated
+  if (!s->IsOriginStream()) {
+    HIP_RETURN(hipErrorStreamCaptureUnmatched);
+  }
+  // If the mode is not hipStreamCaptureModeRelaxed, hipStreamEndCapture must be called on the
+  // stream from the same thread
+  if (s->GetCaptureMode() != hipStreamCaptureModeRelaxed &&
+      std::find(g_captureStreams.begin(), g_captureStreams.end(), stream) ==
+          g_captureStreams.end()) {
+    HIP_RETURN(hipErrorStreamCaptureWrongThread);
+  }
+  // If capture was invalidated due to a violation of the rules of stream capture
+  if (s->GetCaptureStatus() == hipStreamCaptureStatusInvalidated) {
+    *pGraph = nullptr;
+    HIP_RETURN(hipErrorStreamCaptureInvalidated);
+  }
+  // Check that all parallel streams have joined
+  if (s->GetCaptureGraph()->GetLeafNodeCount() != 1) {
+    return hipErrorStreamCaptureUnjoined;
+  }
+  *pGraph = s->GetCaptureGraph();
+  // End capture on all streams/events that are part of the graph capture
+  HIP_RETURN_DURATION(s->EndCapture());
+}
+
+hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags) {
+  HIP_INIT_API(hipGraphCreate, pGraph, flags);
+  *pGraph = new hipGraph();
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipGraphDestroy(hipGraph_t graph) {
+  HIP_INIT_API(hipGraphDestroy, graph);
+  delete graph;
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                 const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                 const hipKernelNodeParams* pNodeParams) {
+  HIP_INIT_API(hipGraphAddKernelNode, pGraphNode, graph, pDependencies, numDependencies,
+               pNodeParams);
+  HIP_RETURN_DURATION(
+      ihipGraphAddKernelNode(pGraphNode, graph, pDependencies, numDependencies, pNodeParams));
+}
+
+hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                 const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                 const hipMemcpy3DParms* pCopyParams) {
+  HIP_INIT_API(hipGraphAddMemcpyNode, pGraphNode, graph, pDependencies, numDependencies,
+               pCopyParams);
+
+  HIP_RETURN_DURATION(
+      ihipGraphAddMemcpyNode(pGraphNode, graph, pDependencies, numDependencies, pCopyParams));
+}
+
+hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                 const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                 const hipMemsetParams* pMemsetParams) {
+  HIP_INIT_API(hipGraphAddMemsetNode, pGraphNode, graph, pDependencies, numDependencies,
+               pMemsetParams);
+
+  HIP_RETURN_DURATION(
+      ihipGraphAddMemsetNode(pGraphNode, graph, pDependencies, numDependencies, pMemsetParams));
+}
+
+hipError_t ihipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
+                                hipGraphNode_t* pErrorNode, char* pLogBuffer, size_t bufferSize) {
+  std::vector<std::vector<Node>> parallelLists;
+  std::unordered_map<Node, std::vector<Node>> nodeWaitLists;
+  graph->GetRunList(parallelLists, nodeWaitLists);
+  std::vector<Node> levelOrder;
+  graph->LevelOrder(levelOrder);
+  *pGraphExec = new hipGraphExec(levelOrder, parallelLists, nodeWaitLists);
+  if (*pGraphExec != nullptr) {
+    return (*pGraphExec)->Init();
+  } else {
+    return hipErrorOutOfMemory;
+  }
+}
+
+hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
+                               hipGraphNode_t* pErrorNode, char* pLogBuffer, size_t bufferSize) {
+  HIP_INIT_API(hipGraphInstantiate, pGraphExec, graph);
+  HIP_RETURN_DURATION(ihipGraphInstantiate(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize));
+}
+
+hipError_t hipGraphExecDestroy(hipGraphExec_t pGraphExec) {
+  HIP_INIT_API(hipGraphExecDestroy, pGraphExec);
+  delete pGraphExec;
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t ihipGraphlaunch(hipGraphExec_t graphExec, hipStream_t stream) {
+  return graphExec->Run(stream);
+}
+
+hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream) {
+  HIP_INIT_API(hipGraphLaunch, graphExec, stream);
+  HIP_RETURN_DURATION(ihipGraphlaunch(graphExec, stream));
+}
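End to end, the capture path above composes as follows; a hedged usage sketch (illustrative only, not part of this patch; `k` is assumed to be an existing __global__ kernel and `s` a stream created with hipStreamCreate()):

    hipGraph_t graph = nullptr;
    hipGraphExec_t graphExec = nullptr;
    hipStreamBeginCapture(s, hipStreamCaptureModeGlobal);
    hipLaunchKernelGGL(k, dim3(1), dim3(64), 0, s);  // recorded as a kernel node, not executed
    hipStreamEndCapture(s, &graph);
    hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0);
    hipGraphLaunch(graphExec, s);
    hipStreamSynchronize(s);
    hipGraphExecDestroy(graphExec);
    hipGraphDestroy(graph);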
diff --git a/rocclr/hip_graph_capture.hpp b/rocclr/hip_graph_capture.hpp
new file mode 100644
index 0000000000..16abfdeef7
--- /dev/null
+++ b/rocclr/hip_graph_capture.hpp
@@ -0,0 +1,48 @@
+/* Copyright (c) 2021-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#pragma once
+// Forward declarations of the capture methods
+hipError_t capturehipLaunchKernel(hipStream_t& stream, const void*& hostFunction, dim3& gridDim,
+                                  dim3& blockDim, void**& args, size_t& sharedMemBytes);
+
+hipError_t capturehipMemcpy3DAsync(hipStream_t& stream, const hipMemcpy3DParms*& p);
+
+hipError_t capturehipMemcpyAsync(hipStream_t& stream, void*& dst, const void*& src,
+                                 size_t& sizeBytes, hipMemcpyKind& kind);
+
+hipError_t capturehipMemcpyFromSymbolAsync(hipStream_t& stream, void*& dst, const void*& symbol,
+                                           size_t& sizeBytes, size_t& offset, hipMemcpyKind& kind);
+
+hipError_t capturehipMemcpyToSymbolAsync(hipStream_t& stream, const void*& symbol, const void*& src,
+                                         size_t& sizeBytes, size_t& offset, hipMemcpyKind& kind);
+
+hipError_t capturehipMemsetAsync(hipStream_t& stream, void*& dst, int& value, size_t& valueSize,
+                                 size_t& sizeBytes);
+
+hipError_t capturehipMemset2DAsync(hipStream_t& stream, void*& dst, size_t& pitch, int& value,
+                                   size_t& width, size_t& height);
+
+hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDevPtr, int& value,
+                                   hipExtent& extent);
+
+hipError_t capturehipEventRecord(hipStream_t& stream, hipEvent_t& event);
+
+hipError_t capturehipStreamWaitEvent(hipEvent_t& event, hipStream_t& stream, unsigned int& flags);
diff --git a/rocclr/hip_graph_helper.hpp b/rocclr/hip_graph_helper.hpp
new file mode 100644
index 0000000000..b28c14e811
--- /dev/null
+++ b/rocclr/hip_graph_helper.hpp
@@ -0,0 +1,35 @@
+hipError_t ihipMemcpy3D_validate(const hipMemcpy3DParms* p);
+
+hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
+
+hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes,
+                             hipMemcpyKind kind, amd::HostQueue& queue);
+
+hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
+                                     uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+                                     uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
+                                     uint32_t sharedMemBytes, void** kernelParams, void** extra,
+                                     int deviceId, uint32_t params);
+
+hipError_t ihipMemset_validate(void* dst, int64_t value, size_t valueSize, size_t sizeBytes);
+
+hipError_t ihipMemset3D_validate(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent,
+                                 size_t sizeBytes);
+
+hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
+                                   uint32_t globalWorkSizeX, uint32_t globalWorkSizeY,
+                                   uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY,
+                                   uint32_t blockDimZ, uint32_t sharedMemBytes,
+                                   amd::HostQueue* queue, void** kernelParams, void** extra,
+                                   hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags,
+                                   uint32_t params, uint32_t gridId, uint32_t numGrids,
+                                   uint64_t prevGridSum, uint64_t allGridSum, uint32_t firstDevice);
+
+hipError_t ihipMemcpy3DCommand(amd::Command*& command, const hipMemcpy3DParms* p,
+                               amd::HostQueue* queue);
+
+hipError_t ihipMemsetCommand(std::vector<amd::Command*>& commands, void* dst, int64_t value,
+                             size_t valueSize, size_t sizeBytes, amd::HostQueue* queue);
+
+hipError_t ihipMemset3DCommand(std::vector<amd::Command*>& commands, hipPitchedPtr pitchedDevPtr,
+                               int value, hipExtent extent, amd::HostQueue* queue);
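Each node type splits its work across these helpers: parameters are validated eagerly when the node is added, and the amd::Command is only materialized at instantiation time. A hedged sketch of the pattern for a 1D copy (illustrative only; `queue` is assumed to be a live amd::HostQueue):

    // Validate eagerly, create the command lazily.
    if (ihipMemcpy_validate(dst, src, sizeBytes, kind) != hipSuccess) {
      return hipErrorInvalidValue;  // reject bad parameters before building a node
    }
    amd::Command* cmd = nullptr;
    if (ihipMemcpyCommand(cmd, dst, src, sizeBytes, kind, *queue) == hipSuccess) {
      cmd->enqueue();   // deferred execution, mirroring hipGraphExec::Run()
      cmd->release();
    }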
diff --git a/rocclr/hip_graph_internal.cpp b/rocclr/hip_graph_internal.cpp
new file mode 100644
index 0000000000..75eb6f6726
--- /dev/null
+++ b/rocclr/hip_graph_internal.cpp
@@ -0,0 +1,364 @@
+/* Copyright (c) 2021-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "hip_graph_internal.hpp"
+#include <queue>
+
+#define CASE_STRING(X, C)  \
+  case X:                  \
+    case_string = #C;      \
+    break;
+const char* GetGraphNodeTypeString(uint32_t op) {
+  const char* case_string;
+  switch (static_cast<hipGraphNodeType>(op)) {
+    CASE_STRING(hipGraphNodeTypeKernel, KernelNode)
+    CASE_STRING(hipGraphNodeTypeMemcpy, Memcpy3DNode)
+    CASE_STRING(hipGraphNodeTypeMemset, MemsetNode)
+    CASE_STRING(hipGraphNodeTypeHost, HostNode)
+    CASE_STRING(hipGraphNodeTypeGraph, GraphNode)
+    CASE_STRING(hipGraphNodeTypeEmpty, EmptyNode)
+    CASE_STRING(hipGraphNodeTypeWaitEvent, WaitEventNode)
+    CASE_STRING(hipGraphNodeTypeEventRecord, EventRecordNode)
+    CASE_STRING(hipGraphNodeTypeMemcpy1D, Memcpy1DNode)
+    CASE_STRING(hipGraphNodeTypeMemcpyFromSymbol, MemcpyFromSymbolNode)
+    CASE_STRING(hipGraphNodeTypeMemcpyToSymbol, MemcpyToSymbolNode)
+    default:
+      case_string = "Unknown node type";
+  }
+  return case_string;
+}
+
+hipError_t hipGraph::AddNode(const Node& node) {
+  vertices_.emplace_back(node);
+  nodeOutDegree_[node] = 0;
+  nodeInDegree_[node] = 0;
+  node->SetLevel(0);
+  ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Add %s(%p)\n",
+          GetGraphNodeTypeString(node->GetType()), node);
+  return hipSuccess;
+}
+
+hipError_t hipGraph::AddEdge(const Node& parentNode, const Node& childNode) {
+  // If a vertex doesn't exist yet, add it to the graph
+  if (std::find(vertices_.begin(), vertices_.end(), parentNode) == vertices_.end()) {
+    AddNode(parentNode);
+  }
+  if (std::find(vertices_.begin(), vertices_.end(), childNode) == vertices_.end()) {
+    AddNode(childNode);
+  }
+  // Check if the edge already exists
+  auto connectedEdges = edges_.find(parentNode);
+  if (connectedEdges != edges_.end()) {
+    if (std::find(connectedEdges->second.begin(), connectedEdges->second.end(), childNode) !=
+        connectedEdges->second.end()) {
+      return hipSuccess;
+    }
+    connectedEdges->second.emplace_back(childNode);
+  } else {
+    edges_[parentNode] = {childNode};
+  }
+  nodeOutDegree_[parentNode]++;
+  nodeInDegree_[childNode]++;
+  childNode->SetLevel(std::max(childNode->GetLevel(), parentNode->GetLevel() + 1));
+  ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Add edge btwn %s(%p) - %s(%p)\n",
+          GetGraphNodeTypeString(parentNode->GetType()), parentNode,
+          GetGraphNodeTypeString(childNode->GetType()), childNode);
+  return hipSuccess;
+}
+
+// Root nodes are all vertices with 0 in-degree
+std::vector<Node> hipGraph::GetRootNodes() const {
+  std::vector<Node> roots;
+  for (auto entry : vertices_) {
+    if (nodeInDegree_.at(entry) == 0) {
+      roots.push_back(entry);
+      ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] root node: %s(%p)\n",
+              GetGraphNodeTypeString(entry->GetType()), entry);
+    }
+  }
+  ClPrint(amd::LOG_INFO, amd::LOG_CODE, "\n");
+  return roots;
+}
+
+// Leaf nodes are all vertices with 0 out-degree
+std::vector<Node> hipGraph::GetLeafNodes() const {
+  std::vector<Node> leafNodes;
+  for (auto entry : vertices_) {
+    if (nodeOutDegree_.at(entry) == 0) {
+      leafNodes.push_back(entry);
+    }
+  }
+  return leafNodes;
+}
+
+size_t hipGraph::GetLeafNodeCount() const {
+  size_t numLeafNodes = 0;
+  for (auto entry : vertices_) {
+    if (nodeOutDegree_.at(entry) == 0) {
+      numLeafNodes++;
+    }
+  }
+  return numLeafNodes;
+}
+
+std::vector<std::pair<Node, Node>> hipGraph::GetEdges() const {
+  std::vector<std::pair<Node, Node>> edges;
+  for (const auto& i : edges_) {
+    for (const auto& j : i.second) {
+      edges.push_back(std::make_pair(i.first, j));
+    }
+  }
+  return edges;
+}
+
+void hipGraph::GetRunListUtil(Node v, std::unordered_map<Node, bool>& visited,
+                              std::vector<Node>& singleList,
+                              std::vector<std::vector<Node>>& parallelLists,
+                              std::unordered_map<Node, std::vector<Node>>& dependencies) {
+  // Mark the current node as visited
+  visited[v] = true;
+  singleList.push_back(v);
+  // Recurse for all the vertices adjacent to this vertex
+  for (auto& adjNode : edges_[v]) {
+    if (!visited[adjNode]) {
+      // For the parallel list nodes, add the parent as the dependency
+      if (singleList.empty()) {
+        ClPrint(amd::LOG_INFO, amd::LOG_CODE,
+                "[hipGraph] For %s(%p) - add parent as dependency %s(%p)\n",
+                GetGraphNodeTypeString(adjNode->GetType()), adjNode,
+                GetGraphNodeTypeString(v->GetType()), v);
+        dependencies[adjNode].push_back(v);
+      }
+      GetRunListUtil(adjNode, visited, singleList, parallelLists, dependencies);
+    } else {
+      for (auto& list : parallelLists) {
+        // Merge singleList when adjNode matches the first element of a list in the
+        // existing lists
+        if (adjNode == list[0]) {
+          for (auto k = singleList.rbegin(); k != singleList.rend(); ++k) {
+            list.insert(list.begin(), *k);
+          }
+          singleList.erase(singleList.begin(), singleList.end());
+        }
+      }
+      // If the list cannot be merged with an existing list, add it as a dependency
+      if (!singleList.empty()) {
+        ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] For %s(%p) - add dependency %s(%p)\n",
+                GetGraphNodeTypeString(adjNode->GetType()), adjNode,
+                GetGraphNodeTypeString(v->GetType()), v);
+        dependencies[adjNode].push_back(v);
+      }
+    }
+  }
+  if (!singleList.empty()) {
+    parallelLists.push_back(singleList);
+    singleList.erase(singleList.begin(), singleList.end());
+  }
+}
+// The function to do a topological sort.
+// It uses the recursive GetRunListUtil().
+void hipGraph::GetRunList(std::vector<std::vector<Node>>& parallelLists,
+                          std::unordered_map<Node, std::vector<Node>>& dependencies) {
+  std::vector<Node> singleList;
+
+  // Mark all the vertices as not visited
+  std::unordered_map<Node, bool> visited;
+  for (auto node : vertices_) visited[node] = false;
+
+  // Call the recursive helper function for all vertices one by one
+  for (auto node : vertices_) {
+    if (visited[node] == false) {
+      GetRunListUtil(node, visited, singleList, parallelLists, dependencies);
+    }
+  }
+  for (size_t i = 0; i < parallelLists.size(); i++) {
+    for (size_t j = 0; j < parallelLists[i].size(); j++) {
+      ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] list %zu - %s(%p)\n", i + 1,
+              GetGraphNodeTypeString(parallelLists[i][j]->GetType()), parallelLists[i][j]);
+    }
+  }
+}
+
+hipError_t hipGraph::LevelOrder(std::vector<Node>& levelOrder) {
+  std::vector<Node> roots = GetRootNodes();
+  std::unordered_map<Node, bool> visited;
+  std::queue<Node> q;
+  for (auto it = roots.begin(); it != roots.end(); it++) {
+    q.push(*it);
+    ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] %s(%p) level:%d \n",
+            GetGraphNodeTypeString((*it)->GetType()), *it, (*it)->GetLevel());
+  }
+  while (!q.empty()) {
+    Node node = q.front();
+    q.pop();
+    levelOrder.push_back(node);
+    for (const auto& i : edges_[node]) {
+      if (visited.find(i) == visited.end() && i->GetLevel() == (node->GetLevel() + 1)) {
+        q.push(i);
+        ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] %s(%p) level:%d \n",
+                GetGraphNodeTypeString(i->GetType()), i, i->GetLevel());
+        visited[i] = true;
+      }
+    }
+  }
+  return hipSuccess;
+}
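To make the two traversals concrete, consider a diamond-shaped graph: AddEdge() assigns levels a=0, b=c=1, d=2, LevelOrder() then emits a breadth-first schedule, and GetRunList() keeps the chain a->b->d in one list while peeling the independent branch c into its own parallel list with a recorded dependency. A hedged sketch (a, b, c, d are assumed to point at already-constructed hipGraphNode instances):

    hipGraph g;
    g.AddEdge(a, b);  // a fans out to b and c
    g.AddEdge(a, c);
    g.AddEdge(b, d);  // b and c join at d
    g.AddEdge(c, d);
    std::vector<Node> order;
    g.LevelOrder(order);  // yields a, b, c, d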
+
+hipError_t hipGraphExec::CreateQueues() {
+  parallelQueues_.reserve(parallelLists_.size());
+  for (size_t i = 0; i < parallelLists_.size(); i++) {
+    // Create a host queue per parallel list
+    amd::HostQueue* queue;
+    cl_command_queue_properties properties =
+        (callbacks_table.is_enabled() || HIP_FORCE_QUEUE_PROFILING) ? CL_QUEUE_PROFILING_ENABLE : 0;
+    queue = new amd::HostQueue(*hip::getCurrentDevice()->asContext(),
+                               *hip::getCurrentDevice()->devices()[0], properties);
+
+    bool result = (queue != nullptr) ? queue->create() : false;
+    if (result) {
+      parallelQueues_.push_back(queue);
+    } else {
+      ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create host queue\n");
+      return hipErrorOutOfMemory;
+    }
+  }
+  return hipSuccess;
+}
+
+hipError_t hipGraphExec::FillCommands() {
+  // Create commands
+  int i = 0;
+  hipError_t status = hipSuccess;
+  for (const auto& list : parallelLists_) {
+    for (auto& node : list) {
+      status = node->CreateCommand(parallelQueues_[i]);
+      if (status != hipSuccess) return status;
+    }
+    i++;
+  }
+  // Add wait lists for all the commands
+  for (auto& node : levelOrder_) {
+    auto nodeWaitList = nodeWaitLists_.find(node);
+    if (nodeWaitList != nodeWaitLists_.end()) {
+      amd::Command::EventWaitList waitList;
+      for (auto depNode : nodeWaitList->second) {
+        for (auto command : depNode->GetCommands()) {
+          waitList.push_back(command);
+        }
+      }
+      for (auto command : nodeWaitList->first->GetCommands()) {
+        command->updateEventWaitList(waitList);
+      }
+    }
+  }
+  return status;
+}
+
+hipError_t hipGraphExec::Init() {
+  hipError_t status;
+  status = CreateQueues();
+  if (status != hipSuccess) {
+    return status;
+  }
+  status = FillCommands();
+  if (status != hipSuccess) {
+    return status;
+  }
+  rootCommand_ = nullptr;
+  /// The stream should execute the next command only after the graph finishes:
+  /// add a marker to the stream that waits for the last command in each parallel queue
+  for (auto& singleList : parallelLists_) {
+    graphLastCmdWaitList_.push_back(singleList.back()->GetCommands().back());
+  }
+  return status;
+}
+
+void hipGraphExec::ResetGraph(cl_event event, cl_int command_exec_status, void* user_data) {
+  ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Inside ResetGraph!\n");
+  hipGraphExec_t graphExec =
+      hipGraphExec::activeGraphExec_[reinterpret_cast<amd::Command*>(user_data)];
+  if (graphExec != nullptr) {
+    for (auto& node : graphExec->levelOrder_) {
+      for (auto& command : node->GetCommands()) {
+        command->resetStatus(CL_INT_MAX);
+      }
+    }
+    graphExec->rootCommand_->resetStatus(CL_INT_MAX);
+    graphExec->bExecPending_.store(false);
+  } else {
+    ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] graphExec is nullptr during ResetGraph!\n");
+  }
+}
+
+hipError_t hipGraphExec::UpdateGraphToWaitOnRoot() {
+  for (auto& singleList : parallelLists_) {
+    amd::Command::EventWaitList waitList;
+    waitList.push_back(rootCommand_);
+    if (!singleList.empty()) {
+      auto commands = singleList[0]->GetCommands();
+      if (!commands.empty()) {
+        commands[0]->updateEventWaitList(waitList);
+      }
+    }
+  }
+  return hipSuccess;
+}
+
+hipError_t hipGraphExec::Run(hipStream_t stream) {
+  if (bExecPending_.load() == true) {
+    ClPrint(
+        amd::LOG_INFO, amd::LOG_CODE,
+        "[hipGraph] Same graph launched while the previous one is active, wait for it to finish!\n");
+    lastEnqueuedGraphCmd_->awaitCompletion();
+  }
+  amd::HostQueue* queue = hip::getQueue(stream);
+  if (queue == nullptr) {
+    return hipErrorInvalidResourceHandle;
+  }
+  if (rootCommand_ == nullptr || rootCommand_->queue() != queue) {
+    if (rootCommand_ != nullptr) {
+      rootCommand_->release();
+    }
+    rootCommand_ = new amd::Marker(*queue, false, {});
+    UpdateGraphToWaitOnRoot();
+  }
+  rootCommand_->enqueue();
+  for (auto& node : levelOrder_) {
+    for (auto& command : node->GetCommands()) {
+      command->enqueue();
+    }
+  }
+
+  amd::Command* command = new amd::Marker(*queue, false, graphLastCmdWaitList_);
+  if (command == nullptr) {
+    return hipErrorOutOfMemory;
+  }
+  amd::Event& event = command->event();
+  if (!event.setCallback(CL_COMPLETE, hipGraphExec::ResetGraph, command)) {
+    return hipErrorInvalidHandle;
+  }
+  hipGraphExec::activeGraphExec_[command] = this;
+  lastEnqueuedGraphCmd_ = command;
+  bExecPending_.store(true);
+  command->enqueue();
+  command->release();
+  return hipSuccess;
+}
\ No newline at end of file
diff --git a/rocclr/hip_graph_internal.hpp b/rocclr/hip_graph_internal.hpp
new file mode 100644
index 0000000000..8c034176f7
--- /dev/null
+++ b/rocclr/hip_graph_internal.hpp
@@ -0,0 +1,355 @@
+/* Copyright (c) 2021-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#pragma once
+#include <algorithm>
+#include <atomic>
+#include <cstring>
+#include <queue>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "hip/hip_runtime.h"
+#include "hip_internal.hpp"
+#include "hip_graph_helper.hpp"
+
+class hipGraphNode;
+typedef hipGraphNode* Node;
+
+class hipGraphNode {
+ protected:
+  uint32_t level_;
+  hipGraphNodeType type_;
+  std::vector<amd::Command*> commands_;
+  bool visited_;
+
+ public:
+  hipGraphNode(hipGraphNodeType type) {
+    type_ = type;
+    level_ = 0;
+    visited_ = false;
+  }
+  virtual ~hipGraphNode() {
+    for (auto command : commands_) {
+      delete command;
+    }
+  }
+  virtual hipError_t CreateCommand(amd::HostQueue* queue) { return hipSuccess; }
+  std::vector<amd::Command*>& GetCommands() { return commands_; }
+  hipGraphNodeType GetType() { return type_; }
+  uint32_t GetLevel() { return level_; }
+  void SetLevel(uint32_t level) { level_ = level; }
+};
+
+class hipGraph {
+  std::unordered_map<Node, uint32_t> nodeInDegree_;   // count of incoming edges for every vertex
+  std::unordered_map<Node, uint32_t> nodeOutDegree_;  // count of outgoing edges for every vertex
+  std::vector<Node> vertices_;
+  std::unordered_map<Node, std::vector<Node>> edges_;
+
+ public:
+  hipGraph() {}
+  ~hipGraph() {}
+  /// Add a node to the graph
+  hipError_t AddNode(const Node& node);
+  /// Add an edge to the graph
+  hipError_t AddEdge(const Node& parentNode, const Node& childNode);
+  /// Returns root nodes, all vertices with 0 in-degree
+  std::vector<Node> GetRootNodes() const;
+  /// Returns leaf nodes, all vertices with 0 out-degree
+  std::vector<Node> GetLeafNodes() const;
+  /// Returns the number of leaf nodes
+  size_t GetLeafNodeCount() const;
+  /// Returns the total number of nodes in the graph
+  size_t GetNodeCount() const { return vertices_.size(); }
+  /// Returns all the nodes in the graph
+  std::vector<Node> GetNodes() const { return vertices_; }
+  /// Returns all the edges in the graph
+  std::vector<std::pair<Node, Node>> GetEdges() const;
+  void GetRunListUtil(Node v, std::unordered_map<Node, bool>& visited,
+                      std::vector<Node>& singleList, std::vector<std::vector<Node>>& parallelLists,
+                      std::unordered_map<Node, std::vector<Node>>& dependencies);
+  void GetRunList(std::vector<std::vector<Node>>& parallelLists,
+                  std::unordered_map<Node, std::vector<Node>>& dependencies);
+  hipError_t LevelOrder(std::vector<Node>& levelOrder);
+};
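These node classes also back the explicit graph-building API defined in hip_graph.cpp; a hedged sketch of that path (illustrative only; `kernelFunc` is assumed to be a host stub registered with HIP, `devPtr` a device allocation, and `args` a packed kernel-argument array):

    hipGraph_t graph = nullptr;
    hipGraphCreate(&graph, 0);
    hipKernelNodeParams kp = {};
    kp.func = reinterpret_cast<void*>(kernelFunc);
    kp.gridDim = dim3(1);
    kp.blockDim = dim3(64);
    kp.kernelParams = args;
    hipGraphNode_t kNode = nullptr;
    hipGraphAddKernelNode(&kNode, graph, nullptr, 0, &kp);
    hipMemsetParams mp = {};
    mp.dst = devPtr;
    mp.value = 0;
    mp.elementSize = sizeof(int);
    mp.width = 256;
    mp.height = 1;
    hipGraphNode_t mNode = nullptr;
    hipGraphAddMemsetNode(&mNode, graph, &kNode, 1, &mp);  // ordered after the kernel node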
+
+class hipGraphKernelNode : public hipGraphNode {
+  hipKernelNodeParams* pKernelParams_;
+  hipFunction_t func_;
+
+ public:
+  hipGraphKernelNode(const hipKernelNodeParams* pNodeParams, const hipFunction_t func)
+      : hipGraphNode(hipGraphNodeTypeKernel) {
+    pKernelParams_ = new hipKernelNodeParams(*pNodeParams);
+    func_ = func;
+  }
+  ~hipGraphKernelNode() { delete pKernelParams_; }
+
+  hipError_t CreateCommand(amd::HostQueue* queue) {
+    commands_.reserve(1);
+    amd::Command* command;
+    hipError_t status = ihipLaunchKernelCommand(
+        command, func_, pKernelParams_->gridDim.x * pKernelParams_->blockDim.x,
+        pKernelParams_->gridDim.y * pKernelParams_->blockDim.y,
+        pKernelParams_->gridDim.z * pKernelParams_->blockDim.z, pKernelParams_->blockDim.x,
+        pKernelParams_->blockDim.y, pKernelParams_->blockDim.z, pKernelParams_->sharedMemBytes,
+        queue, pKernelParams_->kernelParams, pKernelParams_->extra, nullptr, nullptr, 0, 0, 0, 0,
+        0, 0, 0);
+    commands_.emplace_back(command);
+    return status;
+  }
+
+  void GetParams(hipKernelNodeParams* params) {
+    std::memcpy(params, pKernelParams_, sizeof(hipKernelNodeParams));
+  }
+  void SetParams(hipKernelNodeParams* params) {
+    std::memcpy(pKernelParams_, params, sizeof(hipKernelNodeParams));
+  }
+};
+
+class hipGraphMemcpyNode : public hipGraphNode {
+  hipMemcpy3DParms* pCopyParams_;
+
+ public:
+  hipGraphMemcpyNode(const hipMemcpy3DParms* pCopyParams) : hipGraphNode(hipGraphNodeTypeMemcpy) {
+    pCopyParams_ = new hipMemcpy3DParms(*pCopyParams);
+  }
+  ~hipGraphMemcpyNode() { delete pCopyParams_; }
+
+  hipError_t CreateCommand(amd::HostQueue* queue) {
+    commands_.reserve(1);
+    amd::Command* command;
+    hipError_t status = ihipMemcpy3DCommand(command, pCopyParams_, queue);
+    commands_.emplace_back(command);
+    return status;
+  }
+
+  void GetParams(hipMemcpy3DParms* params) {
+    std::memcpy(params, pCopyParams_, sizeof(hipMemcpy3DParms));
+  }
+  void SetParams(hipMemcpy3DParms* params) {
+    std::memcpy(pCopyParams_, params, sizeof(hipMemcpy3DParms));
+  }
+};
+
+class hipGraphMemcpyNode1D : public hipGraphNode {
+  void* dst_;
+  const void* src_;
+  size_t count_;
+  hipMemcpyKind kind_;
+
+ public:
+  hipGraphMemcpyNode1D(void* dst, const void* src, size_t count, hipMemcpyKind kind)
+      : hipGraphNode(hipGraphNodeTypeMemcpy1D), dst_(dst), src_(src), count_(count), kind_(kind) {}
+  ~hipGraphMemcpyNode1D() {}
+
+  hipError_t CreateCommand(amd::HostQueue* queue) {
+    commands_.reserve(1);
+    amd::Command* command = nullptr;
+    hipError_t status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *queue);
+    commands_.emplace_back(command);
+    return status;
+  }
+
+  void SetParams(void* dst, const void* src, size_t count, hipMemcpyKind kind) {
+    dst_ = dst;
+    src_ = src;
+    count_ = count;
+    kind_ = kind;
+  }
+};
+
+template <typename T> class hipGraphMemcpyNodeFromSymbol : public hipGraphNode {
+  void* dst_;
+  const T& symbol_;
+  size_t count_;
+  size_t offset_;
+  hipMemcpyKind kind_;
+
+ public:
+  hipGraphMemcpyNodeFromSymbol(void* dst, const void* symbol, size_t count, size_t offset,
+                               hipMemcpyKind kind)
+      : hipGraphNode(hipGraphNodeTypeMemcpyFromSymbol),
+        dst_(dst),
+        symbol_(symbol),
+        count_(count),
+        offset_(offset),
+        kind_(kind) {}
+  ~hipGraphMemcpyNodeFromSymbol() {}
+
+  hipError_t CreateCommand(amd::HostQueue* queue);
+
+  void SetParams(void* dst, const void* symbol, size_t count, size_t offset, hipMemcpyKind kind) {
+    dst_ = dst;
+    symbol_ = symbol;
+ count_ = count; + offset_ = offset; + kind_ = kind; + } +}; + +template class hipGraphMemcpyNodeToSymbol : public hipGraphNode { + const T& symbol_; + const void* src_; + size_t count_; + size_t offset_; + hipMemcpyKind kind_; + + public: + hipGraphMemcpyNodeToSymbol(const T& symbol, void* src, size_t count, size_t offset, + hipMemcpyKind kind) + : hipGraphNode(hipGraphNodeTypeMemcpyToSymbol), + symbol_(symbol), + src_(src), + count_(count), + offset_(offset), + kind_(kind) {} + ~hipGraphMemcpyNodeToSymbol() {} + + hipError_t CreateCommand(amd::HostQueue* queue); + + void SetParams(const T& symbol, void* src, size_t count, size_t offset, hipMemcpyKind kind) { + symbol_ = symbol; + src_ = src; + count_ = count; + offset_ = offset; + kind_ = kind; + } +}; + +class hipGraphMemsetNode : public hipGraphNode { + hipMemsetParams* pMemsetParams_; + + public: + hipGraphMemsetNode(const hipMemsetParams* pMemsetParams) : hipGraphNode(hipGraphNodeTypeMemset) { + pMemsetParams_ = new hipMemsetParams(*pMemsetParams); + } + ~hipGraphMemsetNode() { delete pMemsetParams_; } + + hipError_t CreateCommand(amd::HostQueue* queue) { + if (pMemsetParams_->height == 1) { + return ihipMemsetCommand(commands_, pMemsetParams_->dst, pMemsetParams_->value, + pMemsetParams_->elementSize, + pMemsetParams_->width * pMemsetParams_->elementSize, queue); + } else { + return ihipMemset3DCommand(commands_, + {pMemsetParams_->dst, pMemsetParams_->pitch, pMemsetParams_->width, + pMemsetParams_->height}, + pMemsetParams_->elementSize, + {pMemsetParams_->width, pMemsetParams_->height, 1}, queue); + } + return hipSuccess; + } + + void GetParams(hipMemsetParams* params) { + std::memcpy(params, pMemsetParams_, sizeof(hipMemsetParams)); + } + void SetParams(hipMemsetParams* params) { + std::memcpy(pMemsetParams_, params, sizeof(hipMemsetParams)); + } +}; + +class hipGraphEventRecordNode : public hipGraphNode { + hipEvent_t event_; + + public: + hipGraphEventRecordNode(hipEvent_t event) + : hipGraphNode(hipGraphNodeTypeEventRecord), event_(event) {} + ~hipGraphEventRecordNode() {} + + hipError_t CreateCommand(amd::HostQueue* queue); + + void GetParams(hipEvent_t* event) { *event = event_; } + void SetParams(hipEvent_t event) { event_ = event; } +}; + +class hipGraphEventWaitNode : public hipGraphNode { + hipEvent_t event_; + + public: + hipGraphEventWaitNode(hipEvent_t event) + : hipGraphNode(hipGraphNodeTypeWaitEvent), event_(event) {} + ~hipGraphEventWaitNode() {} + + hipError_t CreateCommand(amd::HostQueue* queue); + + void GetParams(hipEvent_t* event) { *event = event_; } + void SetParams(hipEvent_t event) { event_ = event; } +}; + +class hipGraphHostNode : public hipGraphNode { + hipHostNodeParams* pNodeParams_; + + public: + hipGraphHostNode(const hipHostNodeParams* pNodeParams) : hipGraphNode(hipGraphNodeTypeHost) { + pNodeParams_ = new hipHostNodeParams(*pNodeParams); + } + ~hipGraphHostNode() { delete pNodeParams_; } + + hipError_t CreateCommand(amd::HostQueue* queue); + + void GetParams(hipHostNodeParams* params) { + std::memcpy(params, pNodeParams_, sizeof(hipHostNodeParams)); + } + void SetParams(hipHostNodeParams* params) { + std::memcpy(pNodeParams_, params, sizeof(hipHostNodeParams)); + } +}; + +class hipGraphExec { + std::vector> parallelLists_; + std::vector levelOrder_; + std::unordered_map> nodeWaitLists_; + std::vector parallelQueues_; + static std::unordered_map activeGraphExec_; + amd::Command::EventWaitList graphLastCmdWaitList_; + amd::Command* lastEnqueuedGraphCmd_; + std::atomic bExecPending_; + 
amd::Command* rootCommand_; + + public: + hipGraphExec(std::vector& levelOrder, std::vector>& lists, + std::unordered_map>& nodeWaitLists) + : parallelLists_(lists), + levelOrder_(levelOrder), + nodeWaitLists_(nodeWaitLists), + lastEnqueuedGraphCmd_(nullptr), + rootCommand_(nullptr) { + bExecPending_.store(false); + } + + ~hipGraphExec() { + for (auto queue : parallelQueues_) { + queue->release(); + } + for (auto node : levelOrder_) { + delete node; + } + } + + hipError_t CreateQueues(); + hipError_t FillCommands(); + hipError_t Init(); + hipError_t UpdateGraphToWaitOnRoot(); + hipError_t Run(hipStream_t stream); + static void ResetGraph(cl_event event, cl_int command_exec_status, void* user_data); +}; diff --git a/rocclr/hip_hcc.def.in b/rocclr/hip_hcc.def.in new file mode 100755 index 0000000000..559a08289d --- /dev/null +++ b/rocclr/hip_hcc.def.in @@ -0,0 +1,293 @@ +EXPORTS +hipChooseDevice +hipCtxCreate +hipCtxDestroy +hipCtxDisablePeerAccess +hipCtxEnablePeerAccess +hipCtxGetApiVersion +hipCtxGetCacheConfig +hipCtxGetCurrent +hipCtxGetDevice +hipCtxGetFlags +hipCtxGetSharedMemConfig +hipCtxPopCurrent +hipCtxPushCurrent +hipCtxSetCacheConfig +hipCtxSetCurrent +hipCtxSetSharedMemConfig +hipCtxSynchronize +hipDeviceCanAccessPeer +hipDeviceComputeCapability +hipDeviceDisablePeerAccess +hipDeviceEnablePeerAccess +hipDeviceGet +hipDeviceGetAttribute +hipDeviceGetByPCIBusId +hipDeviceGetCacheConfig +hipDeviceGetStreamPriorityRange +hipDeviceGetLimit +hipDeviceGetName +hipDeviceGetPCIBusId +hipDeviceGetSharedMemConfig +hipDeviceGetP2PAttribute +hipDevicePrimaryCtxGetState +hipDevicePrimaryCtxRelease +hipDevicePrimaryCtxReset +hipDevicePrimaryCtxRetain +hipDevicePrimaryCtxSetFlags +hipDeviceReset +hipDeviceSetCacheConfig +hipDeviceSetSharedMemConfig +hipDeviceSynchronize +hipDeviceTotalMem +hipDriverGetVersion +hipEventCreate +hipEventCreateWithFlags +hipEventDestroy +hipEventElapsedTime +hipEventQuery +hipEventRecord +hipEventSynchronize +hipExtGetLinkTypeAndHopCount +hipExtLaunchMultiKernelMultiDevice +hipExtMallocWithFlags +hipExtModuleLaunchKernel +hipExtLaunchKernel +hipFree +hipFreeArray +hipFuncSetAttribute +hipFuncSetCacheConfig +hipFuncSetSharedMemConfig +hipGetDevice +hipGetDeviceCount +hipGetDeviceProperties +hipGetErrorName +hipGetErrorString +hipGetLastError +hipMemAllocHost +hipHostAlloc +hipHostFree +hipHostGetDevicePointer +hipHostGetFlags +hipHostMalloc +hipHostRegister +hipHostUnregister +hipInit +hipIpcCloseMemHandle +hipIpcGetMemHandle +hipIpcOpenMemHandle +hipIpcGetEventHandle +hipIpcOpenEventHandle +hipMalloc +hipMalloc3D +hipMalloc3DArray +hipMallocManaged +hipArrayCreate +hipArray3DCreate +hipArrayDestroy +hipMallocArray +hipMemAdvise +hipMemAllocPitch +hipMallocPitch +hipMemcpy +hipMemcpyWithStream +hipMemcpyParam2D +hipMemcpy2D +hipMemcpy2DAsync +hipMemcpy2DToArray +hipMemcpy2DToArrayAsync +hipMemcpy3D +hipMemcpy3DAsync +hipDrvMemcpy3D +hipDrvMemcpy3DAsync +hipMemcpyAsync +hipMemcpyDtoD +hipMemcpyDtoDAsync +hipMemcpyDtoH +hipMemcpyDtoHAsync +hipMemcpyFromSymbol +hipMemcpyFromSymbolAsync +hipMemcpyHtoD +hipMemcpyHtoDAsync +hipMemcpyPeer +hipMemcpyPeerAsync +hipMemcpyToArray +hipMemcpyFromArray +hipMemcpyToSymbol +hipMemcpyToSymbolAsync +hipMemGetAddressRange +hipGetSymbolAddress +hipGetSymbolSize +hipMemGetInfo +hipMemPrefetchAsync +hipMemPtrGetInfo +hipMemRangeGetAttribute +hipMemRangeGetAttributes +hipMemset +hipMemsetAsync +hipMemsetD8 +hipMemsetD8Async +hipMemsetD16 +hipMemsetD16Async +hipMemsetD32 +hipMemsetD32Async +hipMemset2D +hipMemset2DAsync +hipMemset3D 
+hipMemset3DAsync +hipModuleGetFunction +hipModuleGetGlobal +hipModuleGetTexRef +hipModuleLaunchKernel +hipModuleLaunchKernelExt +hipLaunchCooperativeKernel +hipLaunchCooperativeKernelMultiDevice +hipHccModuleLaunchKernel +hipModuleLoad +hipModuleLoadData +hipModuleLoadDataEx +hipModuleUnload +hipModuleOccupancyMaxPotentialBlockSize +hipModuleOccupancyMaxPotentialBlockSizeWithFlags +hipModuleOccupancyMaxActiveBlocksPerMultiprocessor +hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags +hipOccupancyMaxPotentialBlockSize +hipOccupancyMaxActiveBlocksPerMultiprocessor +hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags +hipFuncGetAttribute +hipFuncGetAttributes +hipPeekAtLastError +hipPointerGetAttributes +hipProfilerStart +hipProfilerStop +hipRuntimeGetVersion +hipGetDeviceFlags +hipSetDevice +hipSetDeviceFlags +hipStreamAddCallback +hipStreamAttachMemAsync +hipStreamCreate +hipStreamCreateWithFlags +hipStreamCreateWithPriority +hipStreamDestroy +hipStreamGetFlags +hipStreamQuery +hipStreamSynchronize +hipStreamWaitEvent +__hipPopCallConfiguration +__hipPushCallConfiguration +__hipRegisterFatBinary +__hipRegisterFunction +__hipRegisterVar +__hipRegisterSurface +__hipRegisterTexture +__hipRegisterManagedVar +__hipUnregisterFatBinary +hipConfigureCall +hipSetupArgument +hipLaunchByPtr +hipLaunchKernel +hipRegisterApiCallback +hipRemoveApiCallback +hipRegisterActivityCallback +hipRemoveActivityCallback +hipApiName +hipKernelNameRef +hipBindTexture +hipBindTexture2D +hipBindTextureToArray +hipBindTextureToMipmappedArray +hipGetTextureAlignmentOffset +hipGetTextureReference +hipUnbindTexture +hipCreateChannelDesc +hipCreateTextureObject +hipDestroyTextureObject +hipGetChannelDesc +hipGetTextureObjectResourceDesc +hipGetTextureObjectResourceViewDesc +hipGetTextureObjectTextureDesc +hipTexRefGetAddress +hipTexRefGetAddressMode +hipTexRefGetArray +hipTexRefGetBorderColor +hipTexRefGetFilterMode +hipTexRefGetFlags +hipTexRefGetFormat +hipTexRefGetMaxAnisotropy +hipTexRefGetMipmapFilterMode +hipTexRefGetMipmapLevelBias +hipTexRefGetMipmapLevelClamp +hipTexRefGetMipmappedArray +hipTexRefSetAddress +hipTexRefSetAddress2D +hipTexRefSetAddressMode +hipTexRefSetArray +hipTexRefSetBorderColor +hipTexRefSetFilterMode +hipTexRefSetFlags +hipTexRefSetFormat +hipTexRefSetMaxAnisotropy +hipTexRefSetMipmapFilterMode +hipTexRefSetMipmapLevelBias +hipTexRefSetMipmapLevelClamp +hipTexRefSetMipmappedArray +hipProfilerStart +hipProfilerStop +hipCreateSurfaceObject +hipDestroySurfaceObject +hipInitActivityCallback +hipEnableActivityCallback +hipGetCmdName +hiprtcAddNameExpression +hiprtcCompileProgram +hiprtcCreateProgram +hiprtcDestroyProgram +hiprtcGetLoweredName +hiprtcGetProgramLog +hiprtcGetProgramLogSize +hiprtcGetCode +hiprtcGetCodeSize +hiprtcGetErrorString +hipMipmappedArrayCreate +hipMallocMipmappedArray +hipMipmappedArrayDestroy +hipFreeMipmappedArray +hipMipmappedArrayGetLevel +hipGetMipmappedArrayLevel +hipMallocHost +hipFreeHost +hipTexObjectCreate +hipTexObjectDestroy +hipTexObjectGetResourceDesc +hipTexObjectGetResourceViewDesc +hipTexObjectGetTextureDesc +hipExtStreamCreateWithCUMask +hipStreamGetPriority +hipMemcpy2DFromArray +hipMemcpy2DFromArrayAsync +hipDrvMemcpy2DUnaligned +hipMemcpyAtoH +hipMemcpyHtoA +hipMemcpyParam2DAsync +__gnu_h2f_ieee +__gnu_f2h_ieee +hipExtStreamGetCUMask +hipImportExternalMemory +hipExternalMemoryGetMappedBuffer +hipDestroyExternalMemory +hipGraphCreate +hipGraphDestroy +hipGraphAddKernelNode +hipGraphAddMemsetNode +hipGraphAddMemcpyNode +hipGraphInstantiate 
+hipGraphLaunch +hipStreamIsCapturing +hipStreamBeginCapture +hipStreamEndCapture +hipGraphExecDestroy +hipImportExternalSemaphore +hipSignalExternalSemaphoresAsync +hipWaitExternalSemaphoresAsync +hipDestroyExternalSemaphore diff --git a/rocclr/hip_hcc.map.in b/rocclr/hip_hcc.map.in new file mode 100755 index 0000000000..e7d761d820 --- /dev/null +++ b/rocclr/hip_hcc.map.in @@ -0,0 +1,310 @@ +{ +global: + hipChooseDevice; + hipCtxCreate; + hipCtxDestroy; + hipCtxDisablePeerAccess; + hipCtxEnablePeerAccess; + hipCtxGetApiVersion; + hipCtxGetCacheConfig; + hipCtxGetCurrent; + hipCtxGetDevice; + hipCtxGetFlags; + hipCtxGetSharedMemConfig; + hipCtxPopCurrent; + hipCtxPushCurrent; + hipCtxSetCacheConfig; + hipCtxSetCurrent; + hipCtxSetSharedMemConfig; + hipCtxSynchronize; + hipDeviceCanAccessPeer; + hipDeviceComputeCapability; + hipDeviceDisablePeerAccess; + hipDeviceEnablePeerAccess; + hipDeviceGet; + hipDeviceGetAttribute; + hipDeviceGetByPCIBusId; + hipDeviceGetCacheConfig; + hipDeviceGetStreamPriorityRange; + hipDeviceGetLimit; + hipDeviceGetName; + hipDeviceGetPCIBusId; + hipDeviceGetSharedMemConfig; + hipDeviceGetP2PAttribute; + hipDevicePrimaryCtxGetState; + hipDevicePrimaryCtxRelease; + hipDevicePrimaryCtxReset; + hipDevicePrimaryCtxRetain; + hipDevicePrimaryCtxSetFlags; + hipDeviceReset; + hipDeviceSetCacheConfig; + hipDeviceSetSharedMemConfig; + hipDeviceSynchronize; + hipDeviceTotalMem; + hipDriverGetVersion; + hipEventCreate; + hipEventCreateWithFlags; + hipEventDestroy; + hipEventElapsedTime; + hipEventQuery; + hipEventRecord; + hipEventSynchronize; + hipExtGetLinkTypeAndHopCount; + hipExtLaunchMultiKernelMultiDevice; + hipExtMallocWithFlags; + hipExtModuleLaunchKernel; + hipExtLaunchKernel; + hipFree; + hipFreeArray; + hipFuncSetAttribute; + hipFuncSetCacheConfig; + hipFuncSetSharedMemConfig; + hipGetDevice; + hipGetDeviceCount; + hipGetDeviceProperties; + hipGetErrorName; + hipGetErrorString; + hipGetLastError; + hipMemAdvise; + hipMemAllocHost; + hipHostAlloc; + hipHostFree; + hipHostGetDevicePointer; + hipHostGetFlags; + hipHostMalloc; + hipHostRegister; + hipHostUnregister; + hipInit; + hipIpcCloseMemHandle; + hipIpcGetMemHandle; + hipIpcOpenMemHandle; + hipIpcGetEventHandle; + hipIpcOpenEventHandle; + hipMalloc; + hipMalloc3D; + hipMalloc3DArray; + hipMallocManaged; + hipArrayCreate; + hipArrayDestroy; + hipArray3DCreate; + hipMallocArray; + hipMallocPitch; + hipMemAllocPitch; + hipMemcpy; + hipMemcpyWithStream; + hipMemcpyParam2D; + hipMemcpy2D; + hipMemcpy2DAsync; + hipMemcpy2DToArray; + hipMemcpy2DToArrayAsync; + hipDrvMemcpy2DUnaligned; + hipMemcpy3D; + hipMemcpy3DAsync; + hipDrvMemcpy3D; + hipDrvMemcpy3DAsync; + hipMemcpyAsync; + hipMemcpyDtoD; + hipMemcpyDtoDAsync; + hipMemcpyDtoH; + hipMemcpyDtoHAsync; + hipMemcpyFromSymbol; + hipMemcpyFromSymbolAsync; + hipMemcpyHtoD; + hipMemcpyHtoDAsync; + hipMemcpyPeer; + hipMemcpyPeerAsync; + hipMemcpyToArray; + hipMemcpyFromArray; + hipMemcpyToSymbol; + hipMemcpyToSymbolAsync; + hipMemGetAddressRange; + hipGetSymbolAddress; + hipGetSymbolSize; + hipMemGetInfo; + hipMemPrefetchAsync; + hipMemPtrGetInfo; + hipMemRangeGetAttribute; + hipMemRangeGetAttributes; + hipMemset; + hipMemsetAsync; + hipMemsetD8; + hipMemsetD8Async; + hipMemsetD16; + hipMemsetD16Async; + hipMemsetD32; + hipMemsetD32Async; + hipMemset2D; + hipMemset2DAsync; + hipMemset3D; + hipMemset3DAsync; + hipModuleGetFunction; + hipModuleGetGlobal; + hipModuleGetTexRef; + hipModuleLaunchKernel; + hipModuleLaunchKernelExt; + hipLaunchCooperativeKernel; + 
hipLaunchCooperativeKernelMultiDevice; + hipModuleLoad; + hipModuleLoadData; + hipModuleLoadDataEx; + hipModuleUnload; + hipModuleOccupancyMaxPotentialBlockSize; + hipModuleOccupancyMaxPotentialBlockSizeWithFlags; + hipModuleOccupancyMaxActiveBlocksPerMultiprocessor; + hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags; + hipOccupancyMaxPotentialBlockSize; + hipOccupancyMaxActiveBlocksPerMultiprocessor; + hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags; + hipFuncGetAttribute; + hipFuncGetAttributes; + hipPeekAtLastError; + hipPointerGetAttributes; + hipProfilerStart; + hipProfilerStop; + hipRuntimeGetVersion; + hipGetDeviceFlags; + hipSetDevice; + hipSetDeviceFlags; + hipStreamAddCallback; + hipStreamAttachMemAsync; + hipStreamCreate; + hipStreamCreateWithFlags; + hipStreamCreateWithPriority; + hipStreamDestroy; + hipStreamGetFlags; + hipStreamQuery; + hipStreamSynchronize; + hipStreamWaitEvent; + hipStreamWaitValue32; + hipStreamWaitValue64; + hipStreamWriteValue32; + hipStreamWriteValue64; + __hipPopCallConfiguration; + __hipPushCallConfiguration; + __hipRegisterFatBinary; + __hipRegisterFunction; + __hipRegisterVar; + __hipRegisterSurface; + __hipRegisterTexture; + __hipRegisterManagedVar; + __hipUnregisterFatBinary; + __gnu_h2f_ieee; + __gnu_f2h_ieee; + hipConfigureCall; + hipSetupArgument; + hipLaunchByPtr; + hipLaunchKernel; + hipRegisterApiCallback; + hipRemoveApiCallback; + hipRegisterActivityCallback; + hipRemoveActivityCallback; + hipApiName; + hipKernelNameRef; + hipKernelNameRefByPtr; + hipGetStreamDeviceId; + hipProfilerStart; + hipProfilerStop; + hiprtcCompileProgram; + hiprtcCreateProgram; + hiprtcDestroyProgram; + hiprtcGetLoweredName; + hiprtcGetProgramLog; + hiprtcGetProgramLogSize; + hiprtcGetCode; + hiprtcGetCodeSize; + hiprtcGetErrorString; + hiprtcAddNameExpression; + hiprtcVersion; + hipBindTexture; + hipBindTexture2D; + hipBindTextureToArray; + hipBindTextureToMipmappedArray; + hipGetTextureAlignmentOffset; + hipGetTextureReference; + hipUnbindTexture; + hipCreateChannelDesc; + hipCreateTextureObject; + hipDestroyTextureObject; + hipGetChannelDesc; + hipGetTextureObjectResourceDesc; + hipGetTextureObjectResourceViewDesc; + hipGetTextureObjectTextureDesc; + hipTexRefGetAddress; + hipTexRefGetAddressMode; + hipTexRefGetArray; + hipTexRefGetBorderColor; + hipTexRefGetFilterMode; + hipTexRefGetFlags; + hipTexRefGetFormat; + hipTexRefGetMaxAnisotropy; + hipTexRefGetMipmapFilterMode; + hipTexRefGetMipmapLevelBias; + hipTexRefGetMipmapLevelClamp; + hipTexRefGetMipmappedArray; + hipTexRefSetAddress; + hipTexRefSetAddress2D; + hipTexRefSetAddressMode; + hipTexRefSetArray; + hipTexRefSetBorderColor; + hipTexRefSetFilterMode; + hipTexRefSetFlags; + hipTexRefSetFormat; + hipTexRefSetMaxAnisotropy; + hipTexRefSetMipmapFilterMode; + hipTexRefSetMipmapLevelBias; + hipTexRefSetMipmapLevelClamp; + hipTexRefSetMipmappedArray; + hipMipmappedArrayCreate; + hipMallocMipmappedArray; + hipMipmappedArrayDestroy; + hipFreeMipmappedArray; + hipMipmappedArrayGetLevel; + hipGetMipmappedArrayLevel; + hipMallocHost; + hipFreeHost; + hipTexObjectCreate; + hipTexObjectDestroy; + hipTexObjectGetResourceDesc; + hipTexObjectGetResourceViewDesc; + hipTexObjectGetTextureDesc; + hipImportExternalMemory; + hipExternalMemoryGetMappedBuffer; + hipDestroyExternalMemory; + hipGraphCreate; + hipGraphDestroy; + hipGraphAddKernelNode; + hipGraphAddMemsetNode; + hipGraphAddMemcpyNode; + hipGraphInstantiate; + hipGraphLaunch; + hipStreamIsCapturing; + hipStreamBeginCapture; + hipStreamEndCapture; + 
hipGraphExecDestroy; + hipImportExternalSemaphore; + hipSignalExternalSemaphoresAsync; + hipWaitExternalSemaphoresAsync; + hipDestroyExternalSemaphore; + extern "C++" { + hip_impl::hipLaunchKernelGGLImpl*; + hip_impl::demangle*; + hipCreateSurfaceObject*; + hipDestroySurfaceObject*; + hipHccModuleLaunchKernel*; + hipExtModuleLaunchKernel*; + hipInitActivityCallback*; + hipEnableActivityCallback*; + hipGetCmdName*; + hipExtStreamCreateWithCUMask; + hipStreamGetPriority; + hipMemcpy2DFromArray; + hipMemcpy2DFromArrayAsync; + hipMemcpyAtoH; + hipMemcpyHtoA; + hipMemcpyParam2DAsync; + __hipGetPCH; + hipExtStreamGetCUMask; + }; +local: + *; +}; diff --git a/rocclr/hip_hcc.rc b/rocclr/hip_hcc.rc new file mode 100644 index 0000000000..009dc30c18 --- /dev/null +++ b/rocclr/hip_hcc.rc @@ -0,0 +1,75 @@ +#define STR(__macro__) #__macro__ +#define XSTR(__macro__) STR(__macro__) + +#if defined(_DEBUG) +#define DEBUG_ONLY(x) x +#else +#define DEBUG_ONLY(x) +#endif + +#define VERSION_PREFIX_MAJOR 2 +#define VERSION_PREFIX_MINOR 0 + + +#define APSTUDIO_READONLY_SYMBOLS +///////////////////////////////////////////////////////////////////////////// +// +// Generated from the TEXTINCLUDE 2 resource. +// +#include "winresrc.h" +#include "utils/versions.hpp" + +///////////////////////////////////////////////////////////////////////////// +#undef APSTUDIO_READONLY_SYMBOLS + +///////////////////////////////////////////////////////////////////////////// +// English (U.S.) resources + +#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU) +#ifdef _WIN32 +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US +#pragma code_page(1252) +#endif //_WIN32 + + +///////////////////////////////////////////////////////////////////////////// +// +// Version +// + +VS_VERSION_INFO VERSIONINFO + FILEVERSION 10,0,AMD_PLATFORM_BUILD_NUMBER,AMD_PLATFORM_REVISION_NUMBER + PRODUCTVERSION 10,0,AMD_PLATFORM_BUILD_NUMBER,AMD_PLATFORM_REVISION_NUMBER + FILEFLAGSMASK 0x3fL +#ifdef _DEBUG + FILEFLAGS 0x1L +#else + FILEFLAGS 0x0L +#endif + FILEOS 0x40004L + FILETYPE 0x2L + FILESUBTYPE 0x0L +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904b0" + BEGIN + VALUE "Comments", " \0" + VALUE "CompanyName", "Advanced Micro Devices Inc.\0" + VALUE "FileDescription", AMD_PLATFORM_NAME " OpenCL " XSTR(VERSION_PREFIX_MAJOR) "." XSTR(VERSION_PREFIX_MINOR) " Runtime\0" + VALUE "FileVersion", "10.0." XSTR(AMD_PLATFORM_BUILD_NUMBER) "." XSTR(AMD_PLATFORM_REVISION_NUMBER) + VALUE "InternalName", "OpenCL" + VALUE "LegalCopyright", "Copyright (C) 2011 Advanced Micro Devices Inc.\0" + VALUE "OriginalFilename", "OpenCL.dll" + VALUE "ProductName", "OpenCL " XSTR(VERSION_PREFIX_MAJOR) "." XSTR(VERSION_PREFIX_MINOR) " " AMD_PLATFORM_INFO "\0" + VALUE "ProductVersion", "10.0." XSTR(AMD_PLATFORM_BUILD_NUMBER) "." XSTR(AMD_PLATFORM_REVISION_NUMBER) + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1200 + END +END + +#endif // English (U.S.) resources +///////////////////////////////////////////////////////////////////////////// diff --git a/rocclr/hip_hmm.cpp b/rocclr/hip_hmm.cpp new file mode 100644 index 0000000000..4278115b25 --- /dev/null +++ b/rocclr/hip_hmm.cpp @@ -0,0 +1,220 @@ +/* Copyright (c) 2020-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include +#include "hip_internal.hpp" +#include "hip_conversions.hpp" +#include "platform/context.hpp" +#include "platform/command.hpp" +#include "platform/memory.hpp" + +// Forward declaration of a function +hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0); + +// Make sure HIP defines match ROCclr to avoid double conversion +static_assert(hipCpuDeviceId == amd::CpuDeviceId, "CPU device ID mismatch with ROCclr!"); +static_assert(hipInvalidDeviceId == amd::InvalidDeviceId, + "Invalid device ID mismatch with ROCclr!"); + +static_assert(static_cast<uint32_t>(hipMemAdviseSetReadMostly) == + amd::MemoryAdvice::SetReadMostly, "Enum mismatch with ROCclr!"); +static_assert(static_cast<uint32_t>(hipMemAdviseUnsetReadMostly) == + amd::MemoryAdvice::UnsetReadMostly, "Enum mismatch with ROCclr!"); +static_assert(static_cast<uint32_t>(hipMemAdviseSetPreferredLocation) == + amd::MemoryAdvice::SetPreferredLocation, "Enum mismatch with ROCclr!"); +static_assert(static_cast<uint32_t>(hipMemAdviseUnsetPreferredLocation) == + amd::MemoryAdvice::UnsetPreferredLocation, "Enum mismatch with ROCclr!"); +static_assert(static_cast<uint32_t>(hipMemAdviseSetAccessedBy) == + amd::MemoryAdvice::SetAccessedBy, "Enum mismatch with ROCclr!"); +static_assert(static_cast<uint32_t>(hipMemAdviseUnsetAccessedBy) == + amd::MemoryAdvice::UnsetAccessedBy, "Enum mismatch with ROCclr!"); + +static_assert(static_cast<uint32_t>(hipMemRangeAttributeReadMostly) == + amd::MemRangeAttribute::ReadMostly, "Enum mismatch with ROCclr!"); +static_assert(static_cast<uint32_t>(hipMemRangeAttributePreferredLocation) == + amd::MemRangeAttribute::PreferredLocation, "Enum mismatch with ROCclr!"); +static_assert(static_cast<uint32_t>(hipMemRangeAttributeAccessedBy) == + amd::MemRangeAttribute::AccessedBy, "Enum mismatch with ROCclr!"); +static_assert(static_cast<uint32_t>(hipMemRangeAttributeLastPrefetchLocation) == + amd::MemRangeAttribute::LastPrefetchLocation, "Enum mismatch with ROCclr!"); + +// ================================================================================================ +hipError_t hipMallocManaged(void** dev_ptr, size_t size, unsigned int flags) { + HIP_INIT_API(hipMallocManaged, dev_ptr, size, flags); + + if ((dev_ptr == nullptr) || (size == 0) || + ((flags != hipMemAttachGlobal) && (flags != hipMemAttachHost))) { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(ihipMallocManaged(dev_ptr, size), *dev_ptr); +} + +// ================================================================================================ +hipError_t
hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, + hipStream_t stream) { + HIP_INIT_API(hipMemPrefetchAsync, dev_ptr, count, device, stream); + + if ((dev_ptr == nullptr) || (count == 0)) { + HIP_RETURN(hipErrorInvalidValue); + } + amd::HostQueue* queue = nullptr; + bool cpu_access = (device == hipCpuDeviceId) ? true : false; + + // Pick the specified stream, or the null stream of the relevant device + if (stream != nullptr) { + queue = hip::getQueue(stream); + } else { + if (!cpu_access) { + queue = g_devices[device]->NullStream(); + } else { + queue = hip::getCurrentDevice()->NullStream(); + } + } + if (queue == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + amd::Command::EventWaitList waitList; + amd::SvmPrefetchAsyncCommand* command = + new amd::SvmPrefetchAsyncCommand(*queue, waitList, dev_ptr, count, cpu_access); + if (command == nullptr) { + return hipErrorOutOfMemory; + } + + if (!command->validateMemory()) { + delete command; + HIP_RETURN(hipErrorInvalidValue); + } + command->enqueue(); + command->release(); + + HIP_RETURN(hipSuccess); +} + +// ================================================================================================ +hipError_t hipMemAdvise(const void* dev_ptr, size_t count, hipMemoryAdvise advice, int device) { + HIP_INIT_API(hipMemAdvise, dev_ptr, count, advice, device); + + if ((dev_ptr == nullptr) || (count == 0) || + ((device != hipCpuDeviceId) && (static_cast<size_t>(device) >= g_devices.size()))) { + HIP_RETURN(hipErrorInvalidValue); + } + amd::Device* dev = (device == hipCpuDeviceId) ? + g_devices[0]->devices()[0] : g_devices[device]->devices()[0]; + bool use_cpu = (device == hipCpuDeviceId) ? true : false; + + // Set the allocation attributes in AMD HMM + if (!dev->SetSvmAttributes(dev_ptr, count, static_cast<amd::MemoryAdvice>(advice), use_cpu)) { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(hipSuccess); +} + +// ================================================================================================ +hipError_t hipMemRangeGetAttribute(void* data, size_t data_size, hipMemRangeAttribute attribute, + const void* dev_ptr, size_t count) { + HIP_INIT_API(hipMemRangeGetAttribute, data, data_size, attribute, dev_ptr, count); + + if ((data == nullptr) || (data_size == 0) || (dev_ptr == nullptr) || (count == 0)) { + HIP_RETURN(hipErrorInvalidValue); + } + + // Shouldn't matter for which device the interface is called + amd::Device* dev = g_devices[0]->devices()[0]; + + // Get the allocation attribute from AMD HMM + if (!dev->GetSvmAttributes(&data, &data_size, reinterpret_cast<int*>(&attribute), 1, + dev_ptr, count)) { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(hipSuccess); +}
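// Illustrative caller-side sketch of the managed-memory APIs above (size N and device 0
// are hypothetical; a sketch only, not part of this patch):
//   constexpr size_t N = 1 << 20;
//   float* data = nullptr;
//   if (hipMallocManaged(reinterpret_cast<void**>(&data), N * sizeof(float),
//                        hipMemAttachGlobal) == hipSuccess) {
//     hipMemAdvise(data, N * sizeof(float), hipMemAdviseSetPreferredLocation, 0);
//     hipMemPrefetchAsync(data, N * sizeof(float), 0, nullptr);  // nullptr picks a null stream
//     hipFree(data);
//   }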
+ +// ================================================================================================ +hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes, + hipMemRangeAttribute* attributes, size_t num_attributes, + const void* dev_ptr, size_t count) { + HIP_INIT_API(hipMemRangeGetAttributes, data, data_sizes, + attributes, num_attributes, dev_ptr, count); + + if ((data == nullptr) || (data_sizes == nullptr) || (attributes == nullptr) || + (num_attributes == 0) || (dev_ptr == nullptr) || (count == 0)) { + HIP_RETURN(hipErrorInvalidValue); + } + + // Shouldn't matter for which device the interface is called + amd::Device* dev = g_devices[0]->devices()[0]; + // Get the allocation attributes from AMD HMM + if (!dev->GetSvmAttributes(data, data_sizes, reinterpret_cast<int*>(attributes), + num_attributes, dev_ptr, count)) { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(hipSuccess); +} + +// ================================================================================================ +hipError_t hipStreamAttachMemAsync(hipStream_t stream, hipDeviceptr_t* dev_ptr, + size_t length, unsigned int flags) { + HIP_INIT_API(hipStreamAttachMemAsync, stream, dev_ptr, length, flags); + + if ((stream == nullptr) || (dev_ptr == nullptr) || (length == 0)) { + HIP_RETURN(hipErrorInvalidValue); + } + + // Unclear what should be done for this interface in AMD HMM, since it's a generic SVM alloc + HIP_RETURN(hipSuccess); +} + +// ================================================================================================ +hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align) { + if (ptr == nullptr) { + return hipErrorInvalidValue; + } else if (size == 0) { + *ptr = nullptr; + return hipSuccess; + } + + assert((hip::host_device->asContext() != nullptr) && "Current host context must be valid"); + amd::Context& ctx = *hip::host_device->asContext(); + + const amd::Device& dev = *ctx.devices()[0]; + + // For now limit to the max allocation size on the device. + // The apps should be able to go over the limit in the future + if (dev.info().maxMemAllocSize_ < size) { + return hipErrorMemoryAllocation; + } + + // Allocate SVM fine grain buffer with the forced host pointer, avoiding explicit memory + // allocation in the device driver + *ptr = amd::SvmBuffer::malloc(ctx, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR, + size, (align == 0) ? dev.info().memBaseAddrAlign_ : align); + if (*ptr == nullptr) { + return hipErrorMemoryAllocation; + } + + ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] ihipMallocManaged ptr=0x%zx", getpid(), + std::this_thread::get_id(), *ptr); + return hipSuccess; +} diff --git a/rocclr/hip_intercept.cpp b/rocclr/hip_intercept.cpp new file mode 100755 index 0000000000..6ff64a4bc8 --- /dev/null +++ b/rocclr/hip_intercept.cpp @@ -0,0 +1,81 @@ +/* Copyright (c) 2019-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "hip/hip_runtime.h" +#include "hip_internal.hpp" +#include "hip_platform.hpp" +#include "hip_prof_api.h" + +// HIP API callback/activity + +api_callbacks_table_t callbacks_table; + +extern const std::string& FunctionName(const hipFunction_t f); + +const char* hipKernelNameRef(const hipFunction_t f) { return FunctionName(f).c_str(); } + +int hipGetStreamDeviceId(hipStream_t stream) { + hip::Stream* s = reinterpret_cast<hip::Stream*>(stream); + return (s != nullptr)?
s->DeviceId() : ihipGetDevice(); +} + +const char* hipKernelNameRefByPtr(const void* hostFunction, hipStream_t stream) { + if (hostFunction == NULL) { + return NULL; + } + int deviceId = hipGetStreamDeviceId(stream); + if (deviceId == -1) { + LogPrintfError("Wrong Device Id: %d \n", deviceId); + return NULL; + } + hipFunction_t func = nullptr; + hipError_t hip_error = PlatformState::instance().getStatFunc(&func, hostFunction, deviceId); + if (hip_error != hipSuccess) { + return NULL; + } + return hipKernelNameRef(func); +} + +hipError_t hipRegisterApiCallback(uint32_t id, void* fun, void* arg) { + return callbacks_table.set_callback(id, reinterpret_cast(fun), arg) ? + hipSuccess : hipErrorInvalidValue; +} + +hipError_t hipRemoveApiCallback(uint32_t id) { + return callbacks_table.set_callback(id, NULL, NULL) ? hipSuccess : hipErrorInvalidValue; +} + +hipError_t hipRegisterActivityCallback(uint32_t id, void* fun, void* arg) { + return callbacks_table.set_activity(id, reinterpret_cast(fun), arg) ? + hipSuccess : hipErrorInvalidValue; +} + +hipError_t hipRemoveActivityCallback(uint32_t id) { + return callbacks_table.set_activity(id, NULL, NULL) ? hipSuccess : hipErrorInvalidValue; +} + +hipError_t hipEnableTracing(bool enabled) { + callbacks_table.set_enabled(enabled); + return hipSuccess; +} + +const char* hipApiName(uint32_t id) { + return hip_api_name(id); +} diff --git a/rocclr/hip_internal.hpp b/rocclr/hip_internal.hpp new file mode 100755 index 0000000000..577039ded2 --- /dev/null +++ b/rocclr/hip_internal.hpp @@ -0,0 +1,353 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef HIP_SRC_HIP_INTERNAL_H +#define HIP_SRC_HIP_INTERNAL_H + +#include "vdi_common.hpp" +#include "hip_prof_api.h" +#include "trace_helper.h" +#include "utils/debug.hpp" +#include "hip_formatting.hpp" + +#include "hip_graph_capture.hpp" + +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#else +#include +#endif + +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + +/*! 
IHIP IPC MEMORY Structure */ +#define IHIP_IPC_MEM_HANDLE_SIZE 32 +#define IHIP_IPC_MEM_RESERVED_SIZE LP64_SWITCH(24,16) + +typedef struct ihipIpcMemHandle_st { + char ipc_handle[IHIP_IPC_MEM_HANDLE_SIZE]; ///< ipc memory handle on ROCr + size_t psize; + size_t poffset; + char reserved[IHIP_IPC_MEM_RESERVED_SIZE]; +} ihipIpcMemHandle_t; + +#define IHIP_IPC_EVENT_HANDLE_SIZE 32 +#define IHIP_IPC_EVENT_RESERVED_SIZE LP64_SWITCH(28,24) +typedef struct ihipIpcEventHandle_st { + //hsa_amd_ipc_signal_t ipc_handle; ///< ipc signal handle on ROCr + //char ipc_handle[IHIP_IPC_EVENT_HANDLE_SIZE]; + //char reserved[IHIP_IPC_EVENT_RESERVED_SIZE]; + char shmem_name[IHIP_IPC_EVENT_HANDLE_SIZE]; +} ihipIpcEventHandle_t; + +#ifdef _WIN32 + inline int getpid() { return _getpid(); } +#endif + +#define HIP_INIT() \ + std::call_once(hip::g_ihipInitialized, hip::init); \ + if (hip::g_device == nullptr && g_devices.size() > 0) { \ + hip::g_device = g_devices[0]; \ + } + +#define HIP_API_PRINT(...) \ + uint64_t startTimeUs = 0; HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, "%-5d: [%zx] %s%s ( %s )%s", getpid(), std::this_thread::get_id(), KGRN, \ + __func__, ToString( __VA_ARGS__ ).c_str(), KNRM); + +#define HIP_ERROR_PRINT(err, ...) \ + ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] %s: Returned %s : %s", getpid(), std::this_thread::get_id(), \ + __func__, hipGetErrorName(err), ToString( __VA_ARGS__ ).c_str()); + +// This macro should be called at the beginning of every HIP API. +#define HIP_INIT_API(cid, ...) \ + HIP_API_PRINT(__VA_ARGS__) \ + amd::Thread* thread = amd::Thread::current(); \ + if (!VDI_CHECK_THREAD(thread)) { \ + HIP_RETURN(hipErrorOutOfMemory); \ + } \ + HIP_INIT() \ + HIP_CB_SPAWNER_OBJECT(cid); + +#define HIP_RETURN_DURATION(ret, ...) \ + hip::g_lastError = ret; \ + HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, "%-5d: [%zx] %s: Returned %s : %s", getpid(), std::this_thread::get_id(), \ + __func__, hipGetErrorName(hip::g_lastError), ToString( __VA_ARGS__ ).c_str()); \ + return hip::g_lastError; + +#define HIP_RETURN(ret, ...) \ + hip::g_lastError = ret; \ + HIP_ERROR_PRINT(hip::g_lastError, __VA_ARGS__) \ + return hip::g_lastError; + +#define HIP_RETURN_ONFAIL(func) \ + do { \ + hipError_t herror = (func); \ + if (herror != hipSuccess) { \ + HIP_RETURN(herror); \ + } \ + } while (0); + +// Cannot be used in place of HIP_RETURN. +// Refrain from using it for external HIP APIs. +#define IHIP_RETURN_ONFAIL(func) \ + do { \ + hipError_t herror = (func); \ + if (herror != hipSuccess) { \ + return herror; \ + } \ + } while (0); + +#define STREAM_CAPTURE(name, stream, ...) \ + if (stream != nullptr && \ + reinterpret_cast<hip::Stream*>(stream)->GetCaptureStatus() == \ + hipStreamCaptureStatusActive) { \ + hipError_t status = capture##name(stream, ##__VA_ARGS__); \ + HIP_RETURN(status); \ + } + +#define EVENT_CAPTURE(name, event, ...)
\ + if (event != nullptr && reinterpret_cast<hip::Event*>(event)->GetCaptureStatus() == true) { \ + hipError_t status = capture##name(event, ##__VA_ARGS__); \ + HIP_RETURN(status); \ + } + +namespace hc { +class accelerator; +class accelerator_view; +}; + +namespace hip { + class Device; + + class Stream { + public: + enum Priority : int { High = -1, Normal = 0, Low = 1 }; + + private: + amd::HostQueue* queue_; + mutable amd::Monitor lock_; + Device* device_; + Priority priority_; + unsigned int flags_; + bool null_; + const std::vector<uint32_t> cuMask_; + + /// Stream capture related parameters + + /// Current capture status of the stream + hipStreamCaptureStatus captureStatus_; + /// Graph that is constructed with capture + hipGraph_t pCaptureGraph_; + /// Based on the mode, stream capture places restrictions on the API calls that can be made + /// within the capture or concurrently with it + hipStreamCaptureMode captureMode_; + bool originStream_; + /// The origin stream has no parent. Parent stream for the derived capture streams with event + /// dependencies + hipStream_t parentStream_; + /// Last graph node captured in the stream + std::vector<hipGraphNode_t> lastCapturedNodes_; + /// Derived streams/parallel branches from the origin stream + std::vector<hipStream_t> parallelCaptureStreams_; + /// Capture events + std::vector<hipEvent_t> captureEvents_; + + public: + Stream(Device* dev, Priority p = Priority::Normal, unsigned int f = 0, bool null_stream = false, + const std::vector<uint32_t>& cuMask = {}, + hipStreamCaptureStatus captureStatus = hipStreamCaptureStatusNone); + ~Stream(); + /// Creates the hip stream object, including AMD host queue + bool Create(); + + /// Get device AMD host queue object. The method can allocate the queue + amd::HostQueue* asHostQueue(bool skip_alloc = false); + + void Finish() const; + /// Get device ID associated with the current stream; + int DeviceId() const; + /// Get device ID associated with a stream; + static int DeviceId(const hipStream_t hStream); + /// Returns if stream is null stream + bool Null() const { return null_; } + /// Returns the lock object for the current stream + amd::Monitor& Lock() const { return lock_; } + /// Returns the creation flags for the current stream + unsigned int Flags() const { return flags_; } + /// Returns the priority for the current stream + Priority GetPriority() const { return priority_; } + /// Returns the CU mask for the current stream + const std::vector<uint32_t> GetCUMask() const { return cuMask_; } + + /// Sync all non-blocking streams + static void syncNonBlockingStreams(); + + /// Returns capture status of the current stream + hipStreamCaptureStatus GetCaptureStatus() const { return captureStatus_; } + /// Returns capture mode of the current stream + hipStreamCaptureMode GetCaptureMode() const { return captureMode_; } + /// Returns if stream is origin stream + bool IsOriginStream() const { return originStream_; } + void SetOriginStream() { originStream_ = true; } + /// Returns captured graph + hipGraph_t GetCaptureGraph() const { return pCaptureGraph_; } + /// Returns last captured graph node + std::vector<hipGraphNode_t> GetLastCapturedNodes() const { return lastCapturedNodes_; } + /// Set last captured graph node + void SetLastCapturedNode(hipGraphNode_t graphNode) { + lastCapturedNodes_.clear(); + lastCapturedNodes_.push_back(graphNode); + } + /// Append captured node via the wait event cross stream + void AddCrossCapturedNode(std::vector<hipGraphNode_t> graphNodes) { + for (auto node : graphNodes) { + lastCapturedNodes_.push_back(node); + } + } + /// Set graph that is being captured + void SetCaptureGraph(hipGraph_t pGraph) { +
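// This setter is what moves a stream into hipStreamCaptureStatusActive. A minimal
// lifecycle sketch of the state tracked above (dst/src/bytes are hypothetical; illustrative only):
//   hipStream_t s = nullptr;
//   hipStreamCreate(&s);
//   hipStreamBeginCapture(s, hipStreamCaptureModeGlobal);         // captureStatus_ -> Active
//   hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToDevice, s);  // recorded into pCaptureGraph_
//   hipGraph_t g = nullptr;
//   hipStreamEndCapture(s, &g);                                   // EndCapture() resets this state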
pCaptureGraph_ = pGraph; + captureStatus_ = hipStreamCaptureStatusActive; + } + /// reset capture parameters + hipError_t EndCapture(); + /// Set capture status + void SetCaptureStatus(hipStreamCaptureStatus captureStatus) { captureStatus_ = captureStatus; } + /// Set capture mode + void SetCaptureMode(hipStreamCaptureMode captureMode) { captureMode_ = captureMode; } + /// Set parent stream + void SetParentStream(hipStream_t parentStream) { parentStream_ = parentStream; } + /// Get parent stream + hipStream_t GetParentStream() { return parentStream_; } + }; + + /// HIP Device class + class Device { + amd::Monitor lock_{"Device lock"}; + /// ROCclr context + amd::Context* context_; + /// Device's ID + /// Store it here so we don't have to loop through the device list every time + int deviceId_; + /// ROCclr host queue for default streams + Stream null_stream_; + /// Store device flags + unsigned int flags_; + /// Maintain list of user enabled peers + std::list userEnabledPeers; + + public: + Device(amd::Context* ctx, int devId): + context_(ctx), deviceId_(devId), null_stream_(this, Stream::Priority::Normal, 0, true), flags_(hipDeviceScheduleSpin) + { assert(ctx != nullptr); } + ~Device() {} + + amd::Context* asContext() const { return context_; } + int deviceId() const { return deviceId_; } + void retain() const { context_->retain(); } + void release() const { context_->release(); } + const std::vector& devices() const { return context_->devices(); } + hipError_t EnablePeerAccess(int peerDeviceId){ + amd::ScopedLock lock(lock_); + bool found = (std::find(userEnabledPeers.begin(), userEnabledPeers.end(), peerDeviceId) != userEnabledPeers.end()); + if (found) { + return hipErrorPeerAccessAlreadyEnabled; + } + userEnabledPeers.push_back(peerDeviceId); + return hipSuccess; + } + hipError_t DisablePeerAccess(int peerDeviceId) { + amd::ScopedLock lock(lock_); + bool found = (std::find(userEnabledPeers.begin(), userEnabledPeers.end(), peerDeviceId) != userEnabledPeers.end()); + if (found) { + userEnabledPeers.remove(peerDeviceId); + return hipSuccess; + } else { + return hipErrorPeerAccessNotEnabled; + } + } + unsigned int getFlags() const { return flags_; } + void setFlags(unsigned int flags) { flags_ = flags; } + amd::HostQueue* NullStream(bool skip_alloc = false); + }; + + extern std::once_flag g_ihipInitialized; + /// Current thread's device + extern thread_local Device* g_device; + extern thread_local hipError_t g_lastError; + /// Device representing the host - for pinned memory + extern Device* host_device; + + extern void init(); + + extern Device* getCurrentDevice(); + + extern void setCurrentDevice(unsigned int index); + + /// Get ROCclr queue associated with hipStream + /// Note: This follows the CUDA spec to sync with default streams + /// and Blocking streams + extern amd::HostQueue* getQueue(hipStream_t s); + /// Get default stream associated with the ROCclr context + extern amd::HostQueue* getNullStream(amd::Context&); + /// Get default stream of the thread + extern amd::HostQueue* getNullStream(); + /// Check if stream is valid + extern bool isValid(hipStream_t stream); +}; + +struct ihipExec_t { + dim3 gridDim_; + dim3 blockDim_; + size_t sharedMem_; + hipStream_t hStream_; + std::vector arguments_; +}; + +/// Wait all active streams on the blocking queue. 
The method enqueues a wait command and +/// doesn't stall the current thread +extern void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream = false); + +extern std::vector<hip::Device*> g_devices; +extern hipError_t ihipDeviceGetCount(int* count); +extern int ihipGetDevice(); + +extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags); +extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset); +extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size); + +constexpr bool kOptionChangeable = true; +constexpr bool kNewDevProg = false; + +constexpr bool kMarkerDisableFlush = true; //!< Avoids command batch flush in ROCclr + +#endif // HIP_SRC_HIP_INTERNAL_H diff --git a/rocclr/hip_memory.cpp b/rocclr/hip_memory.cpp new file mode 100755 index 0000000000..0d49cb942c --- /dev/null +++ b/rocclr/hip_memory.cpp @@ -0,0 +1,2852 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include +#include "hip_internal.hpp" +#include "hip_platform.hpp" +#include "hip_conversions.hpp" +#include "platform/context.hpp" +#include "platform/command.hpp" +#include "platform/memory.hpp" +#include "amdocl/cl_vk_amd.hpp" +
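// A worked example of the getMemoryObject() lookup defined below (addresses are
// hypothetical; a sketch only):
//   void* base = nullptr;
//   hipMalloc(&base, 1 << 20);   // the allocation is registered in amd::MemObjMap
//   size_t off = 0;
//   amd::Memory* m = getMemoryObject(static_cast<char*>(base) + 4096, off);
//   // -> m is the amd::Memory object backing `base`, and off == 4096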
+// ================================================================================================ +amd::Memory* getMemoryObject(const void* ptr, size_t& offset) { + amd::Memory *memObj = amd::MemObjMap::FindMemObj(ptr); + if (memObj != nullptr) { + const char* hostPtr = reinterpret_cast<const char*>(ptr); + const char* hostMem = reinterpret_cast<const char*>(memObj->getHostMem()); + // Prepinned memory + if ((hostMem != nullptr) && + (hostPtr >= hostMem && hostPtr <= (hostMem + memObj->getSize()))) { + offset = reinterpret_cast<size_t>(hostPtr) - reinterpret_cast<size_t>(hostMem); + } + else { + // SVM ptr or device ptr mapped from host + const void *devPtr = reinterpret_cast<const void*> + (memObj->getDeviceMemory(*memObj->getContext().devices()[0])->virtualAddress()); + if (devPtr != nullptr) { + offset = reinterpret_cast<size_t>(ptr) - reinterpret_cast<size_t>(devPtr); + } + else { + ShouldNotReachHere(); + } + } + } else { + // If memObj is not found, fall back to arena_mem_obj; it is null if HMM and XNACK are disabled. + memObj = (hip::getCurrentDevice()->asContext()->svmDevices()[0])->GetArenaMemObj(ptr, offset); + } + return memObj; +} + +// ================================================================================================ +amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size) { + size_t offset; + amd::Memory* memObj = getMemoryObject(ptr, offset); + + if (memObj != nullptr) { + assert(size <= (memObj->getSize() - offset)); + memObj = new (memObj->getContext()) amd::Buffer(*memObj, memObj->getMemFlags(), offset, size); + if (memObj == nullptr) { + return nullptr; + } + + if (!memObj->create(nullptr)) { + memObj->release(); + return nullptr; + } + } + + return memObj; +} + +// ================================================================================================ +hipError_t ihipFree(void *ptr) +{ + if (ptr == nullptr) { + return hipSuccess; + } + + size_t offset = 0; + amd::Memory* memory_object = getMemoryObject(ptr, offset); + + if (memory_object != nullptr) { + // Check if it's an allocation in system memory that can be shared across all devices + if (memory_object->getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER) { + for (auto& dev : g_devices) { + // Skip the stream allocation: if the null stream wasn't allocated before this free, + // the device was never used + constexpr bool SkipStreamAlloc = true; + amd::HostQueue* queue = dev->NullStream(SkipStreamAlloc); + if (queue != nullptr) { + queue->finish(); + } + } + } else { + // Wait on the device associated with the current memory object + hip::getNullStream(memory_object->getContext())->finish(); + } + amd::SvmBuffer::free(memory_object->getContext(), ptr); + return hipSuccess; + } + return hipErrorInvalidValue; +} + +hipError_t hipImportExternalMemory(hipExternalMemory_t* extMem_out, const hipExternalMemoryHandleDesc* memHandleDesc) { + HIP_INIT_API(hipImportExternalMemory, extMem_out, memHandleDesc); + + size_t sizeBytes = memHandleDesc->size; + amd::Context& amdContext = *hip::getCurrentDevice()->asContext(); + + amd::BufferVk* pBufferVk = nullptr; +#ifdef _WIN32 + pBufferVk = new (amdContext) amd::BufferVk(amdContext, sizeBytes, memHandleDesc->handle.win32.handle); +#else + pBufferVk = new (amdContext) amd::BufferVk(amdContext, sizeBytes, memHandleDesc->handle.fd); +#endif + + if (!pBufferVk) { + HIP_RETURN(hipErrorOutOfMemory); + } + + if (!pBufferVk->create()) { + pBufferVk->release(); + HIP_RETURN(hipErrorOutOfMemory); + } + *extMem_out = pBufferVk; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipExternalMemoryGetMappedBuffer(void **devPtr, hipExternalMemory_t extMem, const hipExternalMemoryBufferDesc *bufferDesc) { + HIP_INIT_API(hipExternalMemoryGetMappedBuffer, devPtr, extMem, bufferDesc); + + if (extMem == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + amd::BufferVk *buf = reinterpret_cast<amd::BufferVk*>(extMem); + const device::Memory* devMem = buf->getDeviceMemory(*hip::getCurrentDevice()->devices()[0]); + if (devMem != nullptr) { + *devPtr = reinterpret_cast<void*>(devMem->virtualAddress()); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipDestroyExternalMemory(hipExternalMemory_t extMem) { + HIP_INIT_API(hipDestroyExternalMemory, extMem); + + if (extMem == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + reinterpret_cast<amd::BufferVk*>(extMem)->release(); + + HIP_RETURN(hipSuccess); +} + + +hipError_t hipImportExternalSemaphore(hipExternalSemaphore_t* extSem_out, + const hipExternalSemaphoreHandleDesc* semHandleDesc) +{ + HIP_INIT_API(hipImportExternalSemaphore, extSem_out, semHandleDesc); + if (extSem_out == nullptr ||
semHandleDesc == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + amd::Device* device = hip::getCurrentDevice()->devices()[0]; + +#ifdef _WIN32 + if (device->importExtSemaphore(extSem_out, semHandleDesc->handle.win32.handle)) { +#else + if (device->importExtSemaphore( + extSem_out, semHandleDesc->handle.fd)) { +#endif + HIP_RETURN(hipSuccess); + } + HIP_RETURN(hipErrorInvalidValue); +} + + +hipError_t hipSignalExternalSemaphoresAsync( + const hipExternalSemaphore_t* extSemArray, const hipExternalSemaphoreSignalParams* paramsArray, + unsigned int numExtSems, hipStream_t stream ) +{ + HIP_INIT_API(hipSignalExternalSemaphoresAsync, extSemArray, paramsArray, numExtSems, stream); + if (extSemArray == nullptr || paramsArray == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + amd::HostQueue* queue = hip::getQueue(stream); + const amd::Device& device = queue->vdev()->device(); + + for (unsigned int i = 0; i < numExtSems; i++) { + if (extSemArray[i] != nullptr) { + amd::ExternalSemaphoreCmd* command = + new amd::ExternalSemaphoreCmd(*queue, extSemArray[i], paramsArray[i].params.fence.value, + amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE); + if (command == nullptr) { + return hipErrorOutOfMemory; + } + command->enqueue(); + command->release(); + } else { + HIP_RETURN(hipErrorInvalidValue); + } + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipWaitExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemArray, + const hipExternalSemaphoreWaitParams* paramsArray, + unsigned int numExtSems, hipStream_t stream) +{ + HIP_INIT_API(hipWaitExternalSemaphoresAsync, extSemArray, paramsArray, numExtSems, + stream); + if (extSemArray == nullptr || paramsArray == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + amd::HostQueue* queue = hip::getQueue(stream); + const amd::Device& device = queue->vdev()->device(); + + for (unsigned int i = 0; i < numExtSems; i++) { + if (extSemArray[i] != nullptr) { + amd::ExternalSemaphoreCmd* command = + new amd::ExternalSemaphoreCmd(*queue, extSemArray[i], paramsArray[i].params.fence.value, + amd::ExternalSemaphoreCmd::COMMAND_WAIT_EXTSEMAPHORE); + if (command == nullptr) { + return hipErrorOutOfMemory; + } + command->enqueue(); + command->release(); + } else { + HIP_RETURN(hipErrorInvalidValue); + } + } + HIP_RETURN(hipSuccess); +} + +hipError_t hipDestroyExternalSemaphore(hipExternalSemaphore_t extSem) +{ + HIP_INIT_API(hipDestroyExternalSemaphore, extSem); + if (extSem == nullptr ) { + HIP_RETURN(hipErrorInvalidValue); + } + amd::Device* device = hip::getCurrentDevice()->devices()[0]; + device->DestroyExtSemaphore(extSem); + HIP_RETURN(hipSuccess); +} + + +// ================================================================================================ +hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) +{ + if (ptr == nullptr) { + return hipErrorInvalidValue; + } + if (sizeBytes == 0) { + *ptr = nullptr; + return hipSuccess; + } + + bool useHostDevice = (flags & CL_MEM_SVM_FINE_GRAIN_BUFFER) != 0; + amd::Context* curDevContext = hip::getCurrentDevice()->asContext(); + amd::Context* amdContext = useHostDevice ? hip::host_device->asContext() : curDevContext; + + if (amdContext == nullptr) { + return hipErrorOutOfMemory; + } + + if (amdContext->devices()[0]->info().maxMemAllocSize_ < sizeBytes) { + return hipErrorOutOfMemory; + } + + *ptr = amd::SvmBuffer::malloc(*amdContext, flags, sizeBytes, amdContext->devices()[0]->info().memBaseAddrAlign_, + useHostDevice ? 
curDevContext->svmDevices()[0] : nullptr); + if (*ptr == nullptr) { + size_t free = 0, total =0; + hipMemGetInfo(&free, &total); + LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu | total :%zu \n", sizeBytes, free, total); + return hipErrorOutOfMemory; + } + + return hipSuccess; +} + +hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind kind) { + if (dst == nullptr || src == nullptr) { + return hipErrorInvalidValue; + } + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(src, sOffset); + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dst, dOffset); + // Return error if sizeBytes passed to memcpy is more than the actual size allocated + if ((dstMemory && sizeBytes > (dstMemory->getSize() - dOffset)) || + (srcMemory && sizeBytes > (srcMemory->getSize() - sOffset))) { + return hipErrorInvalidValue; + } + return hipSuccess; +} + +hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind kind, amd::HostQueue& queue) { + amd::Command::EventWaitList waitList; + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(src, sOffset); + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dst, dOffset); + amd::Device* queueDevice = &queue.device(); + if ((srcMemory == nullptr) && (dstMemory != nullptr)) { + amd::HostQueue* pQueue = &queue; + if (queueDevice != dstMemory->getContext().devices()[0]) { + pQueue = hip::getNullStream(dstMemory->getContext()); + amd::Command* cmd = queue.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + } + command = new amd::WriteMemoryCommand(*pQueue, CL_COMMAND_WRITE_BUFFER, waitList, + *dstMemory->asBuffer(), dOffset, sizeBytes, src); + } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { + amd::HostQueue* pQueue = &queue; + if (queueDevice != srcMemory->getContext().devices()[0]) { + pQueue = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = queue.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + } + command = new amd::ReadMemoryCommand(*pQueue, CL_COMMAND_READ_BUFFER, waitList, + *srcMemory->asBuffer(), sOffset, sizeBytes, dst); + } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) { + // Check if the queue device doesn't match the device on any memory object. + // And any of them are not host allocation. + // Hence it's a P2P transfer, because the app has requested access to another GPU + if ((srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) && + ((srcMemory->getContext().devices().size() == 1) && + (dstMemory->getContext().devices().size() == 1))) { + command = new amd::CopyMemoryP2PCommand(queue, CL_COMMAND_COPY_BUFFER, waitList, + *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); + if (command == nullptr) { + return hipErrorOutOfMemory; + } + // Make sure runtime has valid memory for the command execution. 
P2P access +// requires page table mapping on the current device to the other GPU's memory + if (!static_cast<amd::CopyMemoryP2PCommand*>(command)->validateMemory()) { + delete command; + return hipErrorInvalidValue; + } + } else { + amd::HostQueue* pQueue = &queue; + if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) && + (queueDevice != srcMemory->getContext().devices()[0])) { + pQueue = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = queue.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + } else if (srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) { + // Scenarios such as DtoH where dst is pinned memory + if ((queueDevice != srcMemory->getContext().devices()[0]) && + (dstMemory->getContext().devices().size() != 1)) { + pQueue = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = queue.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + // Scenarios such as HtoD where src is pinned memory + } else if ((queueDevice != dstMemory->getContext().devices()[0]) && + (srcMemory->getContext().devices().size() != 1)) { + pQueue = hip::getNullStream(dstMemory->getContext()); + amd::Command* cmd = queue.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + } + } + command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList, + *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); + } + } + if (command == nullptr) { + return hipErrorOutOfMemory; + } + if (waitList.size() > 0) { + waitList[0]->release(); + } + return hipSuccess; +}
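// Command selection in ihipMemcpyCommand above, summarized (H = host pointer with no
// tracked amd::Memory object, D = pointer backed by one; a descriptive note only):
//   H -> D : amd::WriteMemoryCommand, staged on the destination context's null stream if needed
//   D -> H : amd::ReadMemoryCommand, staged on the source context's null stream if needed
//   D -> D : amd::CopyMemoryCommand, or amd::CopyMemoryP2PCommand when the buffers
//            belong to different single-device contexts (a true peer-to-peer copy)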
+ +// ================================================================================================ +hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, + amd::HostQueue& queue, bool isAsync = false) { + hipError_t status; + if (sizeBytes == 0) { + // Skip if nothing needs writing. + return hipSuccess; + } + status = ihipMemcpy_validate(dst, src, sizeBytes, kind); + if (status != hipSuccess) { + return status; + } + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(src, sOffset); + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dst, dOffset); + if ((srcMemory == nullptr) && (dstMemory == nullptr)) { + if ((kind == hipMemcpyHostToHost) || (kind == hipMemcpyDefault)) { + queue.finish(); + memcpy(dst, src, sizeBytes); + return hipSuccess; + } else { + return hipErrorInvalidValue; + } + } else if ((srcMemory == nullptr) && (dstMemory != nullptr)) { + isAsync = false; + } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { + isAsync = false; + } + amd::Command* command = nullptr; + status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, queue); + if (status != hipSuccess) { + return status; + } + command->enqueue(); + if (!isAsync) { + command->awaitCompletion(); + } else { + amd::HostQueue* newQueue = command->queue(); + if (newQueue != &queue) { + amd::Command::EventWaitList waitList; + amd::Command* cmd = newQueue->getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + amd::Command* dependentMarker = new amd::Marker(queue, true, waitList); + if (dependentMarker != nullptr) { + dependentMarker->enqueue(); + dependentMarker->release(); + } + cmd->release(); + } + } + } + command->release(); + return hipSuccess; +} + +// ================================================================================================ +hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flags) { + HIP_INIT_API(hipExtMallocWithFlags, ptr, sizeBytes, flags); + + unsigned int ihipFlags = 0; + if (flags == hipDeviceMallocDefault) { + ihipFlags = 0; + } else if (flags == hipDeviceMallocFinegrained) { + ihipFlags = CL_MEM_SVM_ATOMICS; + } else if (flags == hipMallocSignalMemory) { + ihipFlags = CL_MEM_SVM_ATOMICS | CL_MEM_SVM_FINE_GRAIN_BUFFER | ROCCLR_MEM_HSA_SIGNAL_MEMORY; + if (sizeBytes != 8) { + HIP_RETURN(hipErrorInvalidValue); + } + } else { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(ihipMalloc(ptr, sizeBytes, ihipFlags), (ptr != nullptr)? *ptr : nullptr); +} + +hipError_t hipMalloc(void** ptr, size_t sizeBytes) { + HIP_INIT_API(hipMalloc, ptr, sizeBytes); + + HIP_RETURN_DURATION(ihipMalloc(ptr, sizeBytes, 0), (ptr != nullptr)?
*ptr : nullptr); +} + +hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { + HIP_INIT_API(hipHostMalloc, ptr, sizeBytes, flags); + + if (ptr == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + *ptr = nullptr; + + const unsigned int coherentFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; + + // can't have both Coherent and NonCoherent flags set at the same time + if ((flags & coherentFlags) == coherentFlags) { + LogPrintfError( + "Cannot have both coherent and non-coherent flags " + "at the same time, flags: %u coherent flags: %u \n", + flags, coherentFlags); + HIP_RETURN(hipErrorInvalidValue); + } + + unsigned int ihipFlags = CL_MEM_SVM_FINE_GRAIN_BUFFER | (flags << 16); + if (flags == 0 || + flags & (hipHostMallocCoherent | hipHostMallocMapped) || + (!(flags & hipHostMallocNonCoherent) && HIP_HOST_COHERENT)) { + ihipFlags |= CL_MEM_SVM_ATOMICS; + } + + if (flags & hipHostMallocNumaUser) { + ihipFlags |= CL_MEM_FOLLOW_USER_NUMA_POLICY; + } + + HIP_RETURN_DURATION(ihipMalloc(ptr, sizeBytes, ihipFlags), *ptr); +} + +hipError_t hipFree(void* ptr) { + HIP_INIT_API(hipFree, ptr); + + HIP_RETURN(ihipFree(ptr)); +} + +hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy, dst, src, sizeBytes, kind); + + amd::HostQueue* queue = hip::getNullStream(); + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue)); +} + +hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpyWithStream, dst, src, sizeBytes, kind, stream); + + amd::HostQueue* queue = hip::getQueue(stream); + + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, false)); +} + +hipError_t hipMemPtrGetInfo(void *ptr, size_t *size) { + HIP_INIT_API(hipMemPtrGetInfo, ptr, size); + + size_t offset = 0; + amd::Memory* svmMem = getMemoryObject(ptr, offset); + + if (svmMem == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + *size = svmMem->getSize(); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipHostFree(void* ptr) { + HIP_INIT_API(hipHostFree, ptr); + + HIP_RETURN(ihipFree(ptr)); +} + +hipError_t ihipArrayDestroy(hipArray* array) { + if (array == nullptr) { + return hipErrorInvalidValue; + } + + cl_mem memObj = reinterpret_cast(array->data); + if (is_valid(memObj) == false) { + return hipErrorInvalidValue; + } + for (auto& dev : g_devices) { + dev->NullStream()->finish(); + } + as_amd(memObj)->release(); + + delete array; + + return hipSuccess; +} + +hipError_t hipFreeArray(hipArray* array) { + HIP_INIT_API(hipFreeArray, array); + + HIP_RETURN(ihipArrayDestroy(array)); +} + +hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr) { + HIP_INIT_API(hipMemGetAddressRange, pbase, psize, dptr); + + // Since we are using SVM buffer DevicePtr and HostPtr is the same + void* ptr = dptr; + size_t offset = 0; + amd::Memory* svmMem = getMemoryObject(ptr, offset); + + if (svmMem == nullptr) { + HIP_RETURN(hipErrorInvalidDevicePointer); + } + + *pbase = svmMem->getSvmPtr(); + *psize = svmMem->getSize(); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipMemGetInfo(size_t* free, size_t* total) { + HIP_INIT_API(hipMemGetInfo, free, total); + + size_t freeMemory[2]; + amd::Device* device = hip::getCurrentDevice()->devices()[0]; + if(device == nullptr) { + HIP_RETURN(hipErrorInvalidDevice); + } + + if(!device->globalFreeMemory(freeMemory)) { + HIP_RETURN(hipErrorInvalidValue); + } + + *free = freeMemory[0] * Ki; + 
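// (globalFreeMemory() reports KiB, hence the Ki scaling above.) Illustrative caller,
// a sketch only:
//   size_t free_b = 0, total_b = 0;
//   if (hipMemGetInfo(&free_b, &total_b) == hipSuccess) {
//     // free_b and total_b are byte counts for the current device
//   }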
*total = device->info().globalMemSize_; + + HIP_RETURN(hipSuccess); +} + +hipError_t ihipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height, size_t depth, + cl_mem_object_type imageType, const cl_image_format* image_format) { + + amd::Device* device = hip::getCurrentDevice()->devices()[0]; + + if (ptr == nullptr) { + return hipErrorInvalidValue; + } + + if ((width == 0) || (height == 0) || (depth == 0)) { + *ptr = nullptr; + return hipSuccess; + } + + const amd::Image::Format imageFormat(*image_format); + + *pitch = amd::alignUp(width * imageFormat.getElementSize(), device->info().imagePitchAlignment_); + + size_t sizeBytes = *pitch * height * depth; + + if (device->info().maxMemAllocSize_ < sizeBytes) { + return hipErrorOutOfMemory; + } + + *ptr = amd::SvmBuffer::malloc(*hip::getCurrentDevice()->asContext(), 0, sizeBytes, + device->info().memBaseAddrAlign_); + + if (*ptr == nullptr) { + return hipErrorOutOfMemory; + } + + return hipSuccess; +} + + +hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) { + HIP_INIT_API(hipMallocPitch, ptr, pitch, width, height); + + const cl_image_format image_format = { CL_R, CL_UNSIGNED_INT8 }; + HIP_RETURN(ihipMallocPitch(ptr, pitch, width, height, 1, CL_MEM_OBJECT_IMAGE2D, &image_format), (ptr != nullptr)? *ptr : nullptr); +} + +hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent) { + HIP_INIT_API(hipMalloc3D, pitchedDevPtr, extent); + + size_t pitch = 0; + + if (pitchedDevPtr == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + const cl_image_format image_format = { CL_R, CL_UNSIGNED_INT8 }; + hipError_t status = hipSuccess; + status = ihipMallocPitch(&pitchedDevPtr->ptr, &pitch, extent.width, extent.height, extent.depth, + CL_MEM_OBJECT_IMAGE3D, &image_format); + + if (status == hipSuccess) { + pitchedDevPtr->pitch = pitch; + pitchedDevPtr->xsize = extent.width; + pitchedDevPtr->ysize = extent.height; + } + + HIP_RETURN(status, *pitchedDevPtr); +} + +amd::Image* ihipImageCreate(const cl_channel_order channelOrder, + const cl_channel_type channelType, + const cl_mem_object_type imageType, + const size_t imageWidth, + const size_t imageHeight, + const size_t imageDepth, + const size_t imageArraySize, + const size_t imageRowPitch, + const size_t imageSlicePitch, + const uint32_t numMipLevels, + amd::Memory* buffer) { + const amd::Image::Format imageFormat({channelOrder, channelType}); + if (!imageFormat.isValid()) { + LogPrintfError("Invalid Image format for channel Order:%u Type:%u \n", channelOrder, + channelType); + return nullptr; + } + + amd::Context& context = *hip::getCurrentDevice()->asContext(); + if (!imageFormat.isSupported(context, imageType)) { + LogPrintfError("Image type: %u not supported \n", imageType); + return nullptr; + } + + const std::vector& devices = context.devices(); + if (!devices[0]->info().imageSupport_) { + LogPrintfError("Device: 0x%x does not support image \n", devices[0]); + return nullptr; + } + + bool mipMapSupport = true; + for (auto& dev : devices) { + if (!dev->settings().checkExtension(ClKhrMipMapImage)) { + mipMapSupport = false; + } + } + + if (!amd::Image::validateDimensions(devices, + imageType, + imageWidth, + imageHeight, + imageDepth, + imageArraySize)) { + DevLogError("Image does not have valid dimensions \n"); + return nullptr; + } + + if (numMipLevels > 0) { + if (mipMapSupport == true) { + size_t max_dim = std::max(std::max(imageWidth, imageHeight), imageDepth); + size_t mip_levels = 0; + for (mip_levels = 0; max_dim > 0; max_dim >>=1, 
mip_levels++); + // empty for loop + + if (mip_levels < numMipLevels) { + LogPrintfError("Invalid Mip Levels: %d", numMipLevels); + return nullptr; + } + } else { + LogPrintfError("Mipmap not supported on one of the devices, Mip Level: %d", numMipLevels); + return nullptr; + } + } + + // TODO validate the image descriptor. + + amd::Image* image = nullptr; + if (buffer != nullptr) { + switch (imageType) { + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + case CL_MEM_OBJECT_IMAGE2D: + image = new (context) amd::Image(*buffer->asBuffer(), + imageType, + CL_MEM_READ_WRITE, + imageFormat, + imageWidth, + (imageHeight == 0) ? 1 : imageHeight, + (imageDepth == 0) ? 1 : imageDepth, + imageRowPitch, + imageSlicePitch); + break; + default: + ShouldNotReachHere(); + } + } else { + switch (imageType) { + case CL_MEM_OBJECT_IMAGE1D: + case CL_MEM_OBJECT_IMAGE2D: + case CL_MEM_OBJECT_IMAGE3D: + image = new (context) amd::Image(context, + imageType, + CL_MEM_READ_WRITE, + imageFormat, + imageWidth, + (imageHeight == 0) ? 1 : imageHeight, + (imageDepth == 0) ? 1 : imageDepth, + imageWidth * imageFormat.getElementSize(), /* row pitch */ + imageWidth * imageHeight * imageFormat.getElementSize(), /* slice pitch */ + numMipLevels); + break; + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + image = new (context) amd::Image(context, + imageType, + CL_MEM_READ_WRITE, + imageFormat, + imageWidth, + imageArraySize, + 1, /* image depth */ + imageWidth * imageFormat.getElementSize(), + imageWidth * imageHeight * imageFormat.getElementSize(), + numMipLevels); + break; + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + image = new (context) amd::Image(context, + imageType, + CL_MEM_READ_WRITE, + imageFormat, + imageWidth, + imageHeight, + imageArraySize, + imageWidth * imageFormat.getElementSize(), + imageWidth * imageHeight * imageFormat.getElementSize(), + numMipLevels); + break; + default: + ShouldNotReachHere(); + } + } + + if (image == nullptr) { + return nullptr; + } + + if (!image->create(nullptr)) { + LogPrintfError("Cannot create image: 0x%x \n", image); + delete image; + return nullptr; + } + + return image; +} + +hipError_t ihipArrayCreate(hipArray** array, + const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray, + unsigned int numMipmapLevels) { + // NumChannels specifies the number of packed components per HIP array element; it may be 1, 2, or 4; + if ((pAllocateArray->NumChannels != 1) && + (pAllocateArray->NumChannels != 2) && + (pAllocateArray->NumChannels != 4)) { + return hipErrorInvalidValue; + } + + if ((pAllocateArray->Flags & hipArraySurfaceLoadStore) || + (pAllocateArray->Flags & hipArrayCubemap) || + (pAllocateArray->Flags & hipArrayTextureGather)) { + return hipErrorNotSupported; + } + + const cl_channel_order channelOrder = hip::getCLChannelOrder(pAllocateArray->NumChannels, 0); + const cl_channel_type channelType = hip::getCLChannelType(pAllocateArray->Format, hipReadModeElementType); + const cl_mem_object_type imageType = hip::getCLMemObjectType(pAllocateArray->Width, + pAllocateArray->Height, + pAllocateArray->Depth, + pAllocateArray->Flags); + + amd::Image* image = ihipImageCreate(channelOrder, + channelType, + imageType, + pAllocateArray->Width, + pAllocateArray->Height, + pAllocateArray->Depth, + // The number of layers is determined by the depth extent. 
+ pAllocateArray->Depth, /* array size */ + 0, /* row pitch */ + 0, /* slice pitch */ + numMipmapLevels, + nullptr /* buffer */); + + if (image == nullptr) { + return hipErrorInvalidValue; + } + + cl_mem memObj = as_cl(image); + *array = new hipArray{reinterpret_cast(memObj)}; + + // It is UB to call hipGet*() on an array created via hipArrayCreate()/hipArray3DCreate(). + // This is due to hip not differentiating between runtime and driver types. + // TODO change the hipArray struct in driver_types.h. + (*array)->desc = hip::getChannelFormatDesc(pAllocateArray->NumChannels, pAllocateArray->Format); + (*array)->width = pAllocateArray->Width; + (*array)->height = pAllocateArray->Height; + (*array)->depth = pAllocateArray->Depth; + (*array)->Format = pAllocateArray->Format; + (*array)->NumChannels = pAllocateArray->NumChannels; + + return hipSuccess; +} + +hipError_t hipArrayCreate(hipArray** array, + const HIP_ARRAY_DESCRIPTOR* pAllocateArray) { + HIP_INIT_API(hipArrayCreate, array, pAllocateArray); + + HIP_ARRAY3D_DESCRIPTOR desc = {pAllocateArray->Width, + pAllocateArray->Height, + 0, /* Depth */ + pAllocateArray->Format, + pAllocateArray->NumChannels, + hipArrayDefault /* Flags */}; + + HIP_RETURN(ihipArrayCreate(array, &desc, 0)); +} + + +hipError_t hipMallocArray(hipArray** array, + const hipChannelFormatDesc* desc, + size_t width, + size_t height, + unsigned int flags) { + HIP_INIT_API(hipMallocArray, array, desc, width, height, flags); + + HIP_ARRAY3D_DESCRIPTOR allocateArray = {width, + height, + 0, /* Depth */ + hip::getArrayFormat(*desc), + hip::getNumChannels(*desc), + flags}; + + HIP_RETURN(ihipArrayCreate(array, &allocateArray, 0 /* numMipLevels */)); +} + +hipError_t hipArray3DCreate(hipArray** array, + const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray) { + HIP_INIT_API(hipArray3DCreate, array, pAllocateArray); + + HIP_RETURN(ihipArrayCreate(array, pAllocateArray, 0 /* numMipLevels */)); +} + +hipError_t hipMalloc3DArray(hipArray_t* array, + const hipChannelFormatDesc* desc, + hipExtent extent, + unsigned int flags) { + HIP_INIT_API(hipMalloc3DArray, array, desc, extent, flags); + + HIP_ARRAY3D_DESCRIPTOR allocateArray = {extent.width, + extent.height, + extent.depth, + hip::getArrayFormat(*desc), + hip::getNumChannels(*desc), + flags}; + + HIP_RETURN(ihipArrayCreate(array, &allocateArray, 0)); +} + +hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) { + HIP_INIT_API(hipHostGetFlags, flagsPtr, hostPtr); + + if (flagsPtr == nullptr || hostPtr == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + size_t offset = 0; + amd::Memory* svmMem = getMemoryObject(hostPtr, offset); + + if (svmMem == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + *flagsPtr = svmMem->getMemFlags() >> 16; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) { + HIP_INIT_API(hipHostRegister, hostPtr, sizeBytes, flags); + if(hostPtr != nullptr) { + amd::Memory* mem = new (*hip::host_device->asContext()) amd::Buffer(*hip::host_device->asContext(), CL_MEM_USE_HOST_PTR | CL_MEM_SVM_ATOMICS, sizeBytes); + + constexpr bool sysMemAlloc = false; + constexpr bool skipAlloc = false; + constexpr bool forceAlloc = true; + if (!mem->create(hostPtr, sysMemAlloc, skipAlloc, forceAlloc)) { + mem->release(); + LogPrintfError("Cannot create memory for size: %u with flags: %d \n", sizeBytes, flags); + HIP_RETURN(hipErrorOutOfMemory); + } + + for (const auto& device: hip::getCurrentDevice()->devices()) { + // Since the amd::Memory object is shared 
between all devices + // it's fine to have multiple addresses mapped to it + const device::Memory* devMem = mem->getDeviceMemory(*device); + amd::MemObjMap::AddMemObj(reinterpret_cast(devMem->virtualAddress()), mem); + } + + amd::MemObjMap::AddMemObj(hostPtr, mem); + HIP_RETURN(hipSuccess); + } else { + HIP_RETURN_DURATION(ihipMalloc(&hostPtr, sizeBytes, flags), hostPtr); + } +} + +hipError_t hipHostUnregister(void* hostPtr) { + HIP_INIT_API(hipHostUnregister, hostPtr); + + for (auto& dev : g_devices) { + dev->NullStream()->finish(); + } + + if (amd::SvmBuffer::malloced(hostPtr)) { + amd::SvmBuffer::free(*hip::host_device->asContext(), hostPtr); + HIP_RETURN(hipSuccess); + } else { + size_t offset = 0; + amd::Memory* mem = getMemoryObject(hostPtr, offset); + + if(mem) { + for (const auto& device: g_devices) { + const device::Memory* devMem = mem->getDeviceMemory(*device->devices()[0]); + if (devMem != nullptr) { + void* vAddr = reinterpret_cast(devMem->virtualAddress()); + if (amd::MemObjMap::FindMemObj(vAddr)) { + amd::MemObjMap::RemoveMemObj(vAddr); + } + } + } + amd::MemObjMap::RemoveMemObj(hostPtr); + mem->release(); + HIP_RETURN(hipSuccess); + } + } + + LogPrintfError("Cannot unregister host_ptr: 0x%x \n", hostPtr); + HIP_RETURN(hipErrorInvalidValue); +} + +// Deprecated function: +hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags) { + HIP_INIT_API(hipHostAlloc, ptr, sizeBytes, flags); + + HIP_RETURN(ihipMalloc(ptr, sizeBytes, flags), (ptr != nullptr)? *ptr : nullptr); +}; + +inline hipError_t ihipMemcpySymbol_validate(const void* symbol, size_t sizeBytes, size_t offset, size_t &sym_size, hipDeviceptr_t &device_ptr) { + HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), &device_ptr, &sym_size)); + + /* Size Check to make sure offset is correct */ + if ((offset + sizeBytes) > sym_size) { + LogPrintfError("Trying to access out of bounds, offset: %u sizeBytes: %u sym_size: %u \n", + offset, sizeBytes, sym_size); + HIP_RETURN(hipErrorInvalidDevicePointer); + } + + device_ptr = reinterpret_cast
(device_ptr) + offset; + return hipSuccess; +} + +hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes, + size_t offset, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpyToSymbol, symbol, src, sizeBytes, offset, kind); + + size_t sym_size = 0; + hipDeviceptr_t device_ptr = nullptr; + + hipError_t status = ihipMemcpySymbol_validate(symbol, sizeBytes, offset, sym_size, device_ptr); + if (status != hipSuccess) { + return status; + } + + /* Copy memory from source to destination address */ + HIP_RETURN_DURATION(hipMemcpy(device_ptr, src, sizeBytes, kind)); +} + +hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol, size_t sizeBytes, + size_t offset, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpyFromSymbol, symbol, dst, sizeBytes, offset, kind); + + size_t sym_size = 0; + hipDeviceptr_t device_ptr = nullptr; + + hipError_t status = ihipMemcpySymbol_validate(symbol, sizeBytes, offset, sym_size, device_ptr); + if (status != hipSuccess) { + return status; + } + + /* Copy memory from source to destination address */ + HIP_RETURN_DURATION(hipMemcpy(dst, device_ptr, sizeBytes, kind)); +} + +hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, + size_t offset, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpyToSymbolAsync, symbol, src, sizeBytes, offset, kind, stream); + + STREAM_CAPTURE(hipMemcpyToSymbolAsync, stream, symbol, src, sizeBytes, offset, kind); + + size_t sym_size = 0; + hipDeviceptr_t device_ptr = nullptr; + + hipError_t status = ihipMemcpySymbol_validate(symbol, sizeBytes, offset, sym_size, device_ptr); + if (status != hipSuccess) { + return status; + } + /* Copy memory from source to destination address */ + HIP_RETURN_DURATION(hipMemcpyAsync(device_ptr, src, sizeBytes, kind, stream)); +} + +hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol, size_t sizeBytes, + size_t offset, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpyFromSymbolAsync, symbol, dst, sizeBytes, offset, kind, stream); + + STREAM_CAPTURE(hipMemcpyFromSymbolAsync, stream, dst, symbol, sizeBytes, offset, kind); + + size_t sym_size = 0; + hipDeviceptr_t device_ptr = nullptr; + + hipError_t status = ihipMemcpySymbol_validate(symbol, sizeBytes, offset, sym_size, device_ptr); + if (status != hipSuccess) { + return status; + } + + /* Copy memory from source to destination address */ + HIP_RETURN_DURATION(hipMemcpyAsync(dst, device_ptr, sizeBytes, kind, stream)); +} + +hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, + void* srcHost, + size_t ByteCount) { + HIP_INIT_API(hipMemcpyHtoD, dstDevice, srcHost, ByteCount); + + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(nullptr))); +} + +hipError_t hipMemcpyDtoH(void* dstHost, + hipDeviceptr_t srcDevice, + size_t ByteCount) { + HIP_INIT_API(hipMemcpyDtoH, dstHost, srcDevice, ByteCount); + + HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(nullptr))); +} + +hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, + hipDeviceptr_t srcDevice, + size_t ByteCount) { + HIP_INIT_API(hipMemcpyDtoD, dstDevice, srcDevice, ByteCount); + + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(nullptr))); +} + +hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpyAsync, dst, src, sizeBytes, kind, stream); + + STREAM_CAPTURE(hipMemcpyAsync, 
stream, dst, src, sizeBytes, kind); + + amd::HostQueue* queue = hip::getQueue(stream); + + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, true)); +} + +hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, + void* srcHost, + size_t ByteCount, + hipStream_t stream) { + HIP_INIT_API(hipMemcpyHtoDAsync, dstDevice, srcHost, ByteCount, stream); + + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(stream), true)); +} + +hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, + hipDeviceptr_t srcDevice, + size_t ByteCount, + hipStream_t stream) { + HIP_INIT_API(hipMemcpyDtoDAsync, dstDevice, srcDevice, ByteCount, stream); + + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(stream), true)); +} + +hipError_t hipMemcpyDtoHAsync(void* dstHost, + hipDeviceptr_t srcDevice, + size_t ByteCount, + hipStream_t stream) { + HIP_INIT_API(hipMemcpyDtoHAsync, dstHost, srcDevice, ByteCount, stream); + + HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(stream), true)); +} + +hipError_t ihipMemcpyAtoDCommand(amd::Command*& command, hipArray* srcArray, void* dstDevice, + amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, + amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, + amd::HostQueue* queue) { + size_t dstOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dstDevice, dstOffset); + if (srcArray == nullptr || (dstMemory == nullptr)) { + return hipErrorInvalidValue; + } + cl_mem srcMemObj = reinterpret_cast(srcArray->data); + if (!is_valid(srcMemObj)) { + return hipErrorInvalidValue; + } + + amd::Image* srcImage = as_amd(srcMemObj)->asImage(); + // HIP assumes the width is in bytes, but OCL assumes it's in pixels. 
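+  // A small worked example of this conversion (illustrative comment): for an
+  // array created with a float4 channel descriptor the element size is 16
+  // bytes, so a HIP x-origin of 64 bytes and a width of 256 bytes become
+  //   srcOrigin[0]  = 64  / 16 = 4 texels
+  //   copyRegion[0] = 256 / 16 = 16 texels
+  // while the y/z components are already rows/slices and stay unchanged.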
+ const size_t elementSize = srcImage->getImageFormat().getElementSize(); + static_cast(srcOrigin)[0] /= elementSize; + static_cast(copyRegion)[0] /= elementSize; + + amd::BufferRect srcRect; + if (!srcRect.create(static_cast(srcOrigin), static_cast(copyRegion), + srcImage->getRowPitch(), srcImage->getSlicePitch())) { + return hipErrorInvalidValue; + } + + amd::BufferRect dstRect; + if (!dstRect.create(static_cast(dstOrigin), static_cast(copyRegion), + dstRowPitch, dstSlicePitch)) { + return hipErrorInvalidValue; + } + dstRect.start_ += dstOffset; + dstRect.end_ += dstOffset; + + const size_t copySizeInBytes = + copyRegion[0] * copyRegion[1] * copyRegion[2] * srcImage->getImageFormat().getElementSize(); + if (!srcImage->validateRegion(srcOrigin, copyRegion) || + !dstMemory->validateRegion(dstOrigin, {copySizeInBytes, 0, 0})) { + return hipErrorInvalidValue; + } + + command = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_IMAGE_TO_BUFFER, + amd::Command::EventWaitList{}, *srcImage, *dstMemory, + srcOrigin, dstOrigin, copyRegion, srcRect, dstRect); + + if (command == nullptr) { + return hipErrorOutOfMemory; + } + return hipSuccess; +} + +hipError_t ihipMemcpyDtoACommand(amd::Command*& command, void* srcDevice, hipArray* dstArray, + amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, + amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, + amd::HostQueue* queue) { + size_t srcOffset = 0; + amd::Memory* srcMemory = getMemoryObject(srcDevice, srcOffset); + if ((srcMemory == nullptr) || dstArray == nullptr) { + return hipErrorInvalidValue; + } + cl_mem dstMemObj = reinterpret_cast(dstArray->data); + if (!is_valid(dstMemObj)) { + return hipErrorInvalidValue; + } + + amd::Image* dstImage = as_amd(dstMemObj)->asImage(); + // HIP assumes the width is in bytes, but OCL assumes it's in pixels. 
+ const size_t elementSize = dstImage->getImageFormat().getElementSize(); + static_cast(dstOrigin)[0] /= elementSize; + static_cast(copyRegion)[0] /= elementSize; + + amd::BufferRect srcRect; + if (!srcRect.create(static_cast(srcOrigin), static_cast(copyRegion), + srcRowPitch, srcSlicePitch)) { + return hipErrorInvalidValue; + } + srcRect.start_ += srcOffset; + srcRect.end_ += srcOffset; + + amd::BufferRect dstRect; + if (!dstRect.create(static_cast(dstOrigin), static_cast(copyRegion), + dstImage->getRowPitch(), dstImage->getSlicePitch())) { + return hipErrorInvalidValue; + } + + const size_t copySizeInBytes = + copyRegion[0] * copyRegion[1] * copyRegion[2] * dstImage->getImageFormat().getElementSize(); + if (!srcMemory->validateRegion(srcOrigin, {copySizeInBytes, 0, 0}) || + !dstImage->validateRegion(dstOrigin, copyRegion)) { + return hipErrorInvalidValue; + } + + command = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_BUFFER_TO_IMAGE, + amd::Command::EventWaitList{}, *srcMemory, *dstImage, + srcOrigin, dstOrigin, copyRegion, srcRect, dstRect); + + if (command == nullptr) { + return hipErrorOutOfMemory; + } + return hipSuccess; +} + +hipError_t ihipMemcpyDtoDCommand(amd::Command*& command, void* srcDevice, void* dstDevice, + amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, + amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue) { + size_t srcOffset = 0; + amd::Memory* srcMemory = getMemoryObject(srcDevice, srcOffset); + size_t dstOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dstDevice, dstOffset); + + if ((srcMemory == nullptr) || (dstMemory == nullptr)) { + return hipErrorInvalidValue; + } + + amd::BufferRect srcRect; + if (!srcRect.create(static_cast(srcOrigin), static_cast(copyRegion), + srcRowPitch, srcSlicePitch)) { + return hipErrorInvalidValue; + } + srcRect.start_ += srcOffset; + srcRect.end_ += srcOffset; + + amd::Coord3D srcStart(srcRect.start_, 0, 0); + amd::Coord3D srcSize(srcRect.end_ - srcRect.start_, 1, 1); + if (!srcMemory->validateRegion(srcStart, srcSize)) { + return hipErrorInvalidValue; + } + + amd::BufferRect dstRect; + if (!dstRect.create(static_cast(dstOrigin), static_cast(copyRegion), + dstRowPitch, dstSlicePitch)) { + return hipErrorInvalidValue; + } + dstRect.start_ += dstOffset; + dstRect.end_ += dstOffset; + + amd::Coord3D dstStart(dstRect.start_, 0, 0); + amd::Coord3D dstSize(dstRect.end_ - dstRect.start_, 1, 1); + if (!dstMemory->validateRegion(dstStart, dstSize)) { + return hipErrorInvalidValue; + } + + amd::CopyMemoryCommand* copyCommand = new amd::CopyMemoryCommand( + *queue, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, *dstMemory, + srcStart, dstStart, copyRegion, srcRect, dstRect); + + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + + if (!copyCommand->validatePeerMemory()) { + delete copyCommand; + return hipErrorInvalidValue; + } + command = copyCommand; + return hipSuccess; +} + +hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* srcDevice, void* dstHost, + amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, + amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue) { + size_t srcOffset = 0; + amd::Memory* srcMemory = getMemoryObject(srcDevice, srcOffset); + + if ((srcMemory == nullptr) || (dstHost == nullptr)) { + return hipErrorInvalidValue; + } + + amd::BufferRect srcRect; + if (!srcRect.create(static_cast(srcOrigin), 
static_cast(copyRegion), + srcRowPitch, srcSlicePitch)) { + return hipErrorInvalidValue; + } + srcRect.start_ += srcOffset; + srcRect.end_ += srcOffset; + + amd::Coord3D srcStart(srcRect.start_, 0, 0); + amd::Coord3D srcSize(srcRect.end_ - srcRect.start_, 1, 1); + if (!srcMemory->validateRegion(srcStart, srcSize)) { + return hipErrorInvalidValue; + } + + amd::BufferRect dstRect; + if (!dstRect.create(static_cast(dstOrigin), static_cast(copyRegion), + dstRowPitch, dstSlicePitch)) { + return hipErrorInvalidValue; + } + + amd::ReadMemoryCommand* readCommand = + new amd::ReadMemoryCommand(*queue, CL_COMMAND_READ_BUFFER_RECT, amd::Command::EventWaitList{}, + *srcMemory, srcStart, copyRegion, dstHost, srcRect, dstRect); + + if (readCommand == nullptr) { + return hipErrorOutOfMemory; + } + + if (!readCommand->validatePeerMemory()) { + delete readCommand; + return hipErrorInvalidValue; + } + command = readCommand; + return hipSuccess; +} + +hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, void* dstDevice, + amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, + amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue) { + size_t dstOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dstDevice, dstOffset); + + if ((srcHost == nullptr) || (dstMemory == nullptr)) { + return hipErrorInvalidValue; + } + + amd::BufferRect srcRect; + if (!srcRect.create(static_cast(srcOrigin), static_cast(copyRegion), + srcRowPitch, srcSlicePitch)) { + return hipErrorInvalidValue; + } + + amd::BufferRect dstRect; + if (!dstRect.create(static_cast(dstOrigin), static_cast(copyRegion), + dstRowPitch, dstSlicePitch)) { + return hipErrorInvalidValue; + } + dstRect.start_ += dstOffset; + dstRect.end_ += dstOffset; + + amd::Coord3D dstStart(dstRect.start_, 0, 0); + amd::Coord3D dstSize(dstRect.end_ - dstRect.start_, 1, 1); + if (!dstMemory->validateRegion(dstStart, dstSize)) { + return hipErrorInvalidValue; + } + + amd::WriteMemoryCommand* writeCommand = new amd::WriteMemoryCommand( + *queue, CL_COMMAND_WRITE_BUFFER_RECT, amd::Command::EventWaitList{}, *dstMemory, dstStart, + copyRegion, srcHost, dstRect, srcRect); + + if (writeCommand == nullptr) { + return hipErrorOutOfMemory; + } + + if (!writeCommand->validatePeerMemory()) { + delete writeCommand; + return hipErrorInvalidValue; + } + command = writeCommand; + return hipSuccess; +} + +hipError_t ihipMemcpyHtoH(const void* srcHost, void* dstHost, amd::Coord3D srcOrigin, + amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch) { + if ((srcHost == nullptr) || (dstHost == nullptr)) { + return hipErrorInvalidValue; + } + + amd::BufferRect srcRect; + if (!srcRect.create(static_cast(srcOrigin), static_cast(copyRegion), + srcRowPitch, srcSlicePitch)) { + return hipErrorInvalidValue; + } + + amd::BufferRect dstRect; + if (!dstRect.create(static_cast(dstOrigin), static_cast(copyRegion), + dstRowPitch, dstSlicePitch)) { + return hipErrorInvalidValue; + } + + for (size_t slice = 0; slice < copyRegion[2]; slice++) { + for (size_t row = 0; row < copyRegion[1]; row++) { + const void* srcRow = static_cast(srcHost) + srcRect.start_ + + row * srcRect.rowPitch_ + slice * srcRect.slicePitch_; + void* dstRow = static_cast(dstHost) + dstRect.start_ + row * dstRect.rowPitch_ + + slice * dstRect.slicePitch_; + std::memcpy(dstRow, srcRow, copyRegion[0]); + } + } + + return hipSuccess; +} + +hipError_t 
ihipMemcpyAtoACommand(amd::Command*& command, hipArray* srcArray, hipArray* dstArray, + amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, + amd::Coord3D copyRegion, amd::HostQueue* queue) { + if (dstArray == nullptr || srcArray == nullptr) { + return hipErrorInvalidValue; + } + cl_mem srcMemObj = reinterpret_cast(srcArray->data); + cl_mem dstMemObj = reinterpret_cast(dstArray->data); + if (!is_valid(srcMemObj) || !is_valid(dstMemObj)) { + return hipErrorInvalidValue; + } + + amd::Image* srcImage = as_amd(srcMemObj)->asImage(); + amd::Image* dstImage = as_amd(dstMemObj)->asImage(); + + // HIP assumes the width is in bytes, but OCL assumes it's in pixels. + // Note that src and dst should have the same element size. + assert(srcImage->getImageFormat().getElementSize() == + dstImage->getImageFormat().getElementSize()); + const size_t elementSize = srcImage->getImageFormat().getElementSize(); + static_cast(srcOrigin)[0] /= elementSize; + static_cast(dstOrigin)[0] /= elementSize; + static_cast(copyRegion)[0] /= elementSize; + + if (!srcImage->validateRegion(srcOrigin, copyRegion) || + !dstImage->validateRegion(dstOrigin, copyRegion)) { + return hipErrorInvalidValue; + } + + command = + new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_IMAGE, amd::Command::EventWaitList{}, + *srcImage, *dstImage, srcOrigin, dstOrigin, copyRegion); + + if (command == nullptr) { + return hipErrorOutOfMemory; + } + return hipSuccess; +} + +hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hipArray* dstArray, + amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, + amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, + amd::HostQueue* queue) { + if ((srcHost == nullptr) || dstArray == nullptr) { + return hipErrorInvalidValue; + } + cl_mem dstMemObj = reinterpret_cast(dstArray->data); + if (!is_valid(dstMemObj)) { + return hipErrorInvalidValue; + } + + amd::BufferRect srcRect; + if (!srcRect.create(static_cast(srcOrigin), static_cast(copyRegion), + srcRowPitch, srcSlicePitch)) { + return hipErrorInvalidValue; + } + + amd::Image* dstImage = as_amd(dstMemObj)->asImage(); + // HIP assumes the width is in bytes, but OCL assumes it's in pixels. + const size_t elementSize = dstImage->getImageFormat().getElementSize(); + static_cast(dstOrigin)[0] /= elementSize; + static_cast(copyRegion)[0] /= elementSize; + + if (!dstImage->validateRegion(dstOrigin, copyRegion)) { + return hipErrorInvalidValue; + } + + command = new amd::WriteMemoryCommand( + *queue, CL_COMMAND_WRITE_IMAGE, amd::Command::EventWaitList{}, *dstImage, dstOrigin, + copyRegion, static_cast(srcHost) + srcRect.start_, srcRowPitch, srcSlicePitch); + + if (command == nullptr) { + return hipErrorOutOfMemory; + } + return hipSuccess; +} + +hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray* srcArray, void* dstHost, + amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, + amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, + amd::HostQueue* queue) { + if (srcArray == nullptr || (dstHost == nullptr)) { + return hipErrorInvalidValue; + } + cl_mem srcMemObj = reinterpret_cast(srcArray->data); + if (!is_valid(srcMemObj)) { + return hipErrorInvalidValue; + } + + amd::BufferRect dstRect; + if (!dstRect.create(static_cast(dstOrigin), static_cast(copyRegion), + dstRowPitch, dstSlicePitch)) { + return hipErrorInvalidValue; + } + + amd::Image* srcImage = as_amd(srcMemObj)->asImage(); + // HIP assumes the width is in bytes, but OCL assumes it's in pixels. 
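+  // Usage sketch for the driver-style wrappers that reach this builder
+  // (illustrative; 'array' is an assumed hipArray of floats, errors elided):
+  //
+  //   float host[64];
+  //   // Read 64 texels' worth of bytes starting at byte offset 0.
+  //   hipMemcpyAtoH(host, array, 0 /* srcOffset, bytes */, sizeof(host));
+  //   // Write them back starting at byte offset 256.
+  //   hipMemcpyHtoA(array, 256 /* dstOffset, bytes */, host, sizeof(host));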
+  const size_t elementSize = srcImage->getImageFormat().getElementSize();
+  static_cast<size_t*>(srcOrigin)[0] /= elementSize;
+  static_cast<size_t*>(copyRegion)[0] /= elementSize;
+
+  if (!srcImage->validateRegion(srcOrigin, copyRegion) ||
+      !srcImage->isRowSliceValid(dstRowPitch, dstSlicePitch, copyRegion[0], copyRegion[1])) {
+    return hipErrorInvalidValue;
+  }
+
+  command = new amd::ReadMemoryCommand(
+      *queue, CL_COMMAND_READ_IMAGE, amd::Command::EventWaitList{}, *srcImage, srcOrigin,
+      copyRegion, static_cast<char*>(dstHost) + dstRect.start_, dstRowPitch, dstSlicePitch);
+
+  if (command == nullptr) {
+    return hipErrorOutOfMemory;
+  }
+  return hipSuccess;
+}
+
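+// The dispatcher below routes one HIP_MEMCPY3D descriptor to one of nine
+// src/dst command builders. A hedged usage sketch (hostBuf and dev_ptr are
+// assumed allocations; error checks elided):
+//
+//   HIP_MEMCPY3D p = {};
+//   p.srcMemoryType = hipMemoryTypeHost;
+//   p.srcHost = hostBuf;
+//   p.srcPitch = 1024;            // bytes per row of hostBuf
+//   p.srcHeight = 32;             // rows per slice of hostBuf
+//   p.dstMemoryType = hipMemoryTypeDevice;
+//   p.dstDevice = dev_ptr;
+//   p.dstPitch = 1024;
+//   p.dstHeight = 32;
+//   p.WidthInBytes = 800;         // copy 800 bytes out of each row
+//   p.Height = 32;
+//   p.Depth = 1;
+//   hipDrvMemcpy3D(&p);           // resolves to ihipMemcpyHtoDCommand()
+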
+hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3D* pCopy,
+                                       amd::HostQueue* queue) {
+  // If {src/dst}MemoryType is hipMemoryTypeUnified, {src/dst}Device and {src/dst}Pitch specify the
+  // (unified virtual address space) base address of the source data and the bytes per row to
+  // apply. {src/dst}Array is ignored.
+  hipMemoryType srcMemoryType = pCopy->srcMemoryType;
+  if (srcMemoryType == hipMemoryTypeUnified) {
+    srcMemoryType =
+        amd::MemObjMap::FindMemObj(pCopy->srcDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost;
+    if (srcMemoryType == hipMemoryTypeHost) {
+      // {src/dst}Host may be uninitialized. Copy over {src/dst}Device into it if we detect system
+      // memory.
+      const_cast<HIP_MEMCPY3D*>(pCopy)->srcHost = pCopy->srcDevice;
+    }
+  }
+  hipMemoryType dstMemoryType = pCopy->dstMemoryType;
+  if (dstMemoryType == hipMemoryTypeUnified) {
+    dstMemoryType =
+        amd::MemObjMap::FindMemObj(pCopy->dstDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost;
+    if (dstMemoryType == hipMemoryTypeHost) {
+      const_cast<HIP_MEMCPY3D*>(pCopy)->dstHost = pCopy->dstDevice;
+    }
+  }
+
+  // If {src/dst}MemoryType is hipMemoryTypeHost, check if the memory was prepinned.
+  // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning.
+  if (srcMemoryType == hipMemoryTypeHost) {
+    amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy->srcHost);
+    srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost;
+    if (srcMemoryType == hipMemoryTypeDevice) {
+      const_cast<HIP_MEMCPY3D*>(pCopy)->srcDevice = const_cast<void*>(pCopy->srcHost);
+    }
+  }
+  if (dstMemoryType == hipMemoryTypeHost) {
+    amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy->dstHost);
+    dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost;
+    if (dstMemoryType == hipMemoryTypeDevice) {
+      const_cast<HIP_MEMCPY3D*>(pCopy)->dstDevice = const_cast<void*>(pCopy->dstHost);
+    }
+  }
+
+  amd::Coord3D srcOrigin = {pCopy->srcXInBytes, pCopy->srcY, pCopy->srcZ};
+  amd::Coord3D dstOrigin = {pCopy->dstXInBytes, pCopy->dstY, pCopy->dstZ};
+  amd::Coord3D copyRegion = {pCopy->WidthInBytes, pCopy->Height, pCopy->Depth};
+
+  if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeDevice)) {
+    // Host to Device.
+    return ihipMemcpyHtoDCommand(command, pCopy->srcHost, pCopy->dstDevice, srcOrigin, dstOrigin,
+                                 copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight,
+                                 pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue);
+  } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeHost)) {
+    // Device to Host.
+    return ihipMemcpyDtoHCommand(command, pCopy->srcDevice, pCopy->dstHost, srcOrigin, dstOrigin,
+                                 copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight,
+                                 pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue);
+  } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice)) {
+    // Device to Device.
+    return ihipMemcpyDtoDCommand(command, pCopy->srcDevice, pCopy->dstDevice, srcOrigin, dstOrigin,
+                                 copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight,
+                                 pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue);
+  } else if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeArray)) {
+    // Host to Image.
+    return ihipMemcpyHtoACommand(command, pCopy->srcHost, pCopy->dstArray, srcOrigin, dstOrigin,
+                                 copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight,
+                                 queue);
+  } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeHost)) {
+    // Image to Host.
+    return ihipMemcpyAtoHCommand(command, pCopy->srcArray, pCopy->dstHost, srcOrigin, dstOrigin,
+                                 copyRegion, pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight,
+                                 queue);
+  } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeArray)) {
+    // Device to Image.
+    return ihipMemcpyDtoACommand(command, pCopy->srcDevice, pCopy->dstArray, srcOrigin, dstOrigin,
+                                 copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight,
+                                 queue);
+  } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeDevice)) {
+    // Image to Device.
+    return ihipMemcpyAtoDCommand(command, pCopy->srcArray, pCopy->dstDevice, srcOrigin, dstOrigin,
+                                 copyRegion, pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight,
+                                 queue);
+  } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeArray)) {
+    // Image to Image.
+    return ihipMemcpyAtoACommand(command, pCopy->srcArray, pCopy->dstArray, srcOrigin, dstOrigin,
+                                 copyRegion, queue);
+  } else {
+    ShouldNotReachHere();
+  }
+
+  return hipSuccess;
+}
+
+inline hipError_t ihipMemcpyCmdEnqueue(amd::Command* command, bool isAsync = false) {
+  hipError_t status = hipSuccess;
+  if (command == nullptr) {
+    return hipErrorOutOfMemory;
+  }
+  command->enqueue();
+  if (!isAsync) {
+    if (!command->awaitCompletion()) {
+      status = hipErrorUnknown;
+    }
+  }
+  command->release();
+  return status;
+}
+
+hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool isAsync = false) {
+  amd::Command* command;
+  hipError_t status;
+  if (pCopy->WidthInBytes == 0 || pCopy->Height == 0 || pCopy->Depth == 0) {
+    LogPrintfInfo("Nothing to copy: one of WidthInBytes: %zu, Height: %zu, Depth: %zu is zero",
+                  pCopy->WidthInBytes, pCopy->Height, pCopy->Depth);
+    return hipSuccess;
+  }
+  // If {src/dst}MemoryType is hipMemoryTypeUnified, {src/dst}Device and {src/dst}Pitch specify the
+  // (unified virtual address space) base address of the source data and the bytes per row to
+  // apply. {src/dst}Array is ignored.
+  hipMemoryType srcMemoryType = pCopy->srcMemoryType;
+  if (srcMemoryType == hipMemoryTypeUnified) {
+    srcMemoryType =
+        amd::MemObjMap::FindMemObj(pCopy->srcDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost;
+    if (srcMemoryType == hipMemoryTypeHost) {
+      // {src/dst}Host may be uninitialized. Copy over {src/dst}Device into it if we detect system
+      // memory.
+      const_cast<HIP_MEMCPY3D*>(pCopy)->srcHost = pCopy->srcDevice;
+    }
+  }
+  hipMemoryType dstMemoryType = pCopy->dstMemoryType;
+  if (dstMemoryType == hipMemoryTypeUnified) {
+    dstMemoryType =
+        amd::MemObjMap::FindMemObj(pCopy->dstDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost;
+    if (dstMemoryType == hipMemoryTypeHost) {
+      const_cast<HIP_MEMCPY3D*>(pCopy)->dstHost = pCopy->dstDevice;
+    }
+  }
+  // If {src/dst}MemoryType is hipMemoryTypeHost, check if the memory was prepinned.
+  // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning.
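+  // For example (illustrative sketch): after
+  //   hipHostRegister(h, bytes, hipHostRegisterDefault);
+  // the pointer h is tracked in amd::MemObjMap, so the lookups below find the
+  // prepinned amd::Memory object and the copy proceeds as if h were
+  // device-resident, instead of pinning the host range again for this call.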
+ if (srcMemoryType == hipMemoryTypeHost) { + amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy->srcHost); + srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + } + if (dstMemoryType == hipMemoryTypeHost) { + amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy->dstHost); + dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + } + if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeHost)) { + amd::Coord3D srcOrigin = {pCopy->srcXInBytes, pCopy->srcY, pCopy->srcZ}; + amd::Coord3D dstOrigin = {pCopy->dstXInBytes, pCopy->dstY, pCopy->dstZ}; + amd::Coord3D copyRegion = {pCopy->WidthInBytes, (pCopy->Height != 0) ? pCopy->Height : 1, + (pCopy->Depth != 0) ? pCopy->Depth : 1}; + // Host to Host. + return ihipMemcpyHtoH(pCopy->srcHost, pCopy->dstHost, srcOrigin, dstOrigin, copyRegion, + pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, pCopy->dstPitch, + pCopy->dstPitch * pCopy->dstHeight); + } else { + status = ihipGetMemcpyParam3DCommand(command, pCopy, hip::getQueue(stream)); + if (status != hipSuccess) return status; + return ihipMemcpyCmdEnqueue(command, isAsync); + } +} + +hipError_t ihipMemcpyParam2D(const hip_Memcpy2D* pCopy, + hipStream_t stream, + bool isAsync = false) { + HIP_MEMCPY3D desc = hip::getDrvMemcpy3DDesc(*pCopy); + + return ihipMemcpyParam3D(&desc, stream, isAsync); +} + +hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind, hipStream_t stream, bool isAsync = false) { + hip_Memcpy2D desc = {}; + if (spitch == 0 || dpitch == 0) { + return hipErrorUnknown; + } + if (width == 0 || height == 0) { + return hipSuccess; + } + + desc.srcXInBytes = 0; + desc.srcY = 0; + desc.srcMemoryType = std::get<0>(hip::getMemoryType(kind)); + desc.srcHost = src; + desc.srcDevice = const_cast(src); + desc.srcArray = nullptr; // Ignored. + desc.srcPitch = spitch; + + desc.dstXInBytes = 0; + desc.dstY = 0; + desc.dstMemoryType = std::get<1>(hip::getMemoryType(kind)); + desc.dstHost = dst; + desc.dstDevice = dst; + desc.dstArray = nullptr; // Ignored. 
+ desc.dstPitch = dpitch; + + desc.WidthInBytes = width; + desc.Height = height; + + return ihipMemcpyParam2D(&desc, stream, isAsync); +} + +hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) { + HIP_INIT_API(hipMemcpyParam2D, pCopy); + + HIP_RETURN_DURATION(ihipMemcpyParam2D(pCopy, nullptr)); +} + +hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy2D, dst, dpitch, src, spitch, width, height, kind); + + HIP_RETURN_DURATION(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, nullptr)); +} + +hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpy2DAsync, dst, dpitch, src, spitch, width, height, kind, stream); + + HIP_RETURN_DURATION(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, stream, true)); +} + +hipError_t ihipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream, bool isAsync = false) { + if (dst == nullptr) { + HIP_RETURN(hipErrorInvalidResourceHandle); + } + + hip_Memcpy2D desc = {}; + + desc.srcXInBytes = 0; + desc.srcY = 0; + desc.srcMemoryType = std::get<0>(hip::getMemoryType(kind)); + desc.srcHost = const_cast(src); + desc.srcDevice = const_cast(src); + desc.srcArray = nullptr; + desc.srcPitch = spitch; + + desc.dstXInBytes = wOffset; + desc.dstY = hOffset; + desc.dstMemoryType = hipMemoryTypeArray; + desc.dstHost = nullptr; + desc.dstDevice = nullptr; + desc.dstArray = dst; + desc.dstPitch = 0; // Ignored. + + desc.WidthInBytes = width; + desc.Height = height; + + return ihipMemcpyParam2D(&desc, stream, isAsync); +} + +hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy2DToArray, dst, wOffset, hOffset, src, spitch, width, height, kind); + + if (spitch == 0) { + HIP_RETURN(hipErrorInvalidPitchValue); + } + + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, nullptr)); +} + +hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpyToArray, dst, wOffset, hOffset, src, count, kind); + + if (dst == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + const size_t arrayHeight = (dst->height != 0) ? dst->height : 1; + const size_t witdthInBytes = count / arrayHeight; + + const size_t height = (count / dst->width) / hip::getElementSize(dst); + + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, witdthInBytes, height, kind, nullptr)); +} + +hipError_t ihipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream, bool isAsync = false) { + if (src == nullptr) { + HIP_RETURN(hipErrorInvalidResourceHandle); + } + + hip_Memcpy2D desc = {}; + + desc.srcXInBytes = wOffsetSrc; + desc.srcY = hOffsetSrc; + desc.srcMemoryType = hipMemoryTypeArray; + desc.srcHost = nullptr; + desc.srcDevice = nullptr; + desc.srcArray = const_cast(src); + desc.srcPitch = 0; // Ignored. 
+ + desc.dstXInBytes = 0; + desc.dstY = 0; + desc.dstMemoryType = std::get<1>(hip::getMemoryType(kind)); + desc.dstHost = dst; + desc.dstDevice = dst; + desc.dstArray = nullptr; + desc.dstPitch = dpitch; + + desc.WidthInBytes = width; + desc.Height = height; + + return ihipMemcpyParam2D(&desc, stream, isAsync); +} + +hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset, size_t count, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpyFromArray, dst, src, wOffsetSrc, hOffset, count, kind); + + if (src == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + const size_t arrayHeight = (src->height != 0) ? src->height : 1; + const size_t witdthInBytes = count / arrayHeight; + + const size_t height = (count / src->width) / hip::getElementSize(src); + + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffset, witdthInBytes, height, kind, nullptr)); +} + +hipError_t ihipMemcpyAtoD(hipArray* srcArray, void* dstDevice, amd::Coord3D srcOrigin, + amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, + size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { + amd::Command* command; + hipError_t status = + ihipMemcpyAtoDCommand(command, srcArray, dstDevice, srcOrigin, dstOrigin, copyRegion, + dstRowPitch, dstSlicePitch, hip::getQueue(stream)); + if (status != hipSuccess) return status; + return ihipMemcpyCmdEnqueue(command, isAsync); +} +hipError_t ihipMemcpyDtoA(void* srcDevice, hipArray* dstArray, amd::Coord3D srcOrigin, + amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, + size_t srcSlicePitch, hipStream_t stream, bool isAsync = false) { + amd::Command* command; + hipError_t status = + ihipMemcpyDtoACommand(command, srcDevice, dstArray, srcOrigin, dstOrigin, copyRegion, + srcRowPitch, srcSlicePitch, hip::getQueue(stream)); + if (status != hipSuccess) return status; + return ihipMemcpyCmdEnqueue(command, isAsync); +} +hipError_t ihipMemcpyDtoD(void* srcDevice, void* dstDevice, amd::Coord3D srcOrigin, + amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + hipStream_t stream, bool isAsync = false) { + amd::Command* command; + hipError_t status = ihipMemcpyDtoDCommand(command, srcDevice, dstDevice, srcOrigin, dstOrigin, + copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, + dstSlicePitch, hip::getQueue(stream)); + if (status != hipSuccess) return status; + return ihipMemcpyCmdEnqueue(command, isAsync); +} +hipError_t ihipMemcpyDtoH(void* srcDevice, void* dstHost, amd::Coord3D srcOrigin, + amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + hipStream_t stream, bool isAsync = false) { + amd::Command* command; + hipError_t status = ihipMemcpyDtoHCommand(command, srcDevice, dstHost, srcOrigin, dstOrigin, + copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, + dstSlicePitch, hip::getQueue(stream)); + if (status != hipSuccess) return status; + return ihipMemcpyCmdEnqueue(command, isAsync); +} +hipError_t ihipMemcpyHtoD(const void* srcHost, void* dstDevice, amd::Coord3D srcOrigin, + amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + hipStream_t stream, bool isAsync = false) { + amd::Command* command; + hipError_t status = ihipMemcpyHtoDCommand(command, srcHost, dstDevice, srcOrigin, dstOrigin, + copyRegion, srcRowPitch, srcSlicePitch, 
dstRowPitch, + dstSlicePitch, hip::getQueue(stream)); + if (status != hipSuccess) return status; + return ihipMemcpyCmdEnqueue(command, isAsync); +} +hipError_t ihipMemcpyAtoA(hipArray* srcArray, hipArray* dstArray, amd::Coord3D srcOrigin, + amd::Coord3D dstOrigin, amd::Coord3D copyRegion, hipStream_t stream, + bool isAsync = false) { + amd::Command* command; + hipError_t status = ihipMemcpyAtoACommand(command, srcArray, dstArray, srcOrigin, dstOrigin, + copyRegion, hip::getQueue(stream)); + if (status != hipSuccess) return status; + return ihipMemcpyCmdEnqueue(command, isAsync); +} +hipError_t ihipMemcpyHtoA(const void* srcHost, hipArray* dstArray, amd::Coord3D srcOrigin, + amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, + size_t srcSlicePitch, hipStream_t stream, bool isAsync = false) { + amd::Command* command; + hipError_t status = + ihipMemcpyHtoACommand(command, srcHost, dstArray, srcOrigin, dstOrigin, copyRegion, + srcRowPitch, srcSlicePitch, hip::getQueue(stream)); + if (status != hipSuccess) return status; + return ihipMemcpyCmdEnqueue(command, isAsync); +} +hipError_t ihipMemcpyAtoH(hipArray* srcArray, void* dstHost, amd::Coord3D srcOrigin, + amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, + size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { + amd::Command* command; + hipError_t status = + ihipMemcpyAtoHCommand(command, srcArray, dstHost, srcOrigin, dstOrigin, copyRegion, + dstRowPitch, dstSlicePitch, hip::getQueue(stream)); + if (status != hipSuccess) return status; + return ihipMemcpyCmdEnqueue(command, isAsync); +} + +hipError_t hipMemcpyHtoA(hipArray* dstArray, + size_t dstOffset, + const void* srcHost, + size_t ByteCount) { + HIP_INIT_API(hipMemcpyHtoA, dstArray, dstOffset, srcHost, ByteCount); + + HIP_RETURN_DURATION(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); +} + +hipError_t hipMemcpyAtoH(void* dstHost, + hipArray* srcArray, + size_t srcOffset, + size_t ByteCount) { + HIP_INIT_API(hipMemcpyAtoH, dstHost, srcArray, srcOffset, ByteCount); + + HIP_RETURN_DURATION(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); +} + +hipError_t ihipMemcpy3D_validate(const hipMemcpy3DParms* p) { + // The struct passed to hipMemcpy3D() must specify one of srcArray or srcPtr and one of dstArray + // or dstPtr. Passing more than one non-zero source or destination will cause hipMemcpy3D() to + // return an error. + if (p == nullptr || ((p->srcArray != nullptr) && (p->srcPtr.ptr != nullptr)) || + ((p->dstArray != nullptr) && (p->dstPtr.ptr != nullptr))) { + return hipErrorInvalidValue; + } + + // If the source and destination are both arrays, hipMemcpy3D() will return an error if they do + // not have the same element size. 
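+  // A parameter block that passes both checks (illustrative sketch; hostBuf and
+  // dstArr are assumed allocations):
+  //
+  //   hipMemcpy3DParms p = {};
+  //   p.srcPtr = make_hipPitchedPtr(hostBuf, widthBytes, width, height);
+  //   p.srcArray = nullptr;                     // exactly one source
+  //   p.dstArray = dstArr;                      // exactly one destination
+  //   p.extent = make_hipExtent(width, height, depth);
+  //   p.kind = hipMemcpyHostToDevice;
+  //   hipMemcpy3D(&p);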
+  if (((p->srcArray != nullptr) && (p->dstArray != nullptr)) &&
+      (hip::getElementSize(p->srcArray) != hip::getElementSize(p->dstArray))) {
+    return hipErrorInvalidValue;
+  }
+  return hipSuccess;
+}
+
+hipError_t ihipMemcpy3DCommand(amd::Command*& command, const hipMemcpy3DParms* p,
+                               amd::HostQueue* queue) {
+  const HIP_MEMCPY3D desc = hip::getDrvMemcpy3DDesc(*p);
+  return ihipGetMemcpyParam3DCommand(command, &desc, queue);
+}
+
+hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, hipStream_t stream, bool isAsync = false) {
+  hipError_t status = ihipMemcpy3D_validate(p);
+  if (status != hipSuccess) {
+    return status;
+  }
+  const HIP_MEMCPY3D desc = hip::getDrvMemcpy3DDesc(*p);
+
+  return ihipMemcpyParam3D(&desc, stream, isAsync);
+}
+
+hipError_t hipMemcpy3D(const hipMemcpy3DParms* p) {
+  HIP_INIT_API(hipMemcpy3D, p);
+
+  HIP_RETURN_DURATION(ihipMemcpy3D(p, nullptr));
+}
+
+hipError_t hipMemcpy3DAsync(const hipMemcpy3DParms* p, hipStream_t stream) {
+  HIP_INIT_API(hipMemcpy3DAsync, p, stream);
+
+  STREAM_CAPTURE(hipMemcpy3DAsync, stream, p);
+
+  HIP_RETURN_DURATION(ihipMemcpy3D(p, stream, true));
+}
+
+hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy) {
+  HIP_INIT_API(hipDrvMemcpy3D, pCopy);
+
+  HIP_RETURN_DURATION(ihipMemcpyParam3D(pCopy, nullptr));
+}
+
+hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) {
+  HIP_INIT_API(hipDrvMemcpy3DAsync, pCopy, stream);
+
+  HIP_RETURN_DURATION(ihipMemcpyParam3D(pCopy, stream, true));
+}
+
+hipError_t packFillMemoryCommand(amd::Command*& command, amd::Memory* memory, size_t offset,
+                                 int64_t value, size_t valueSize, size_t sizeBytes,
+                                 amd::HostQueue* queue) {
+  if ((memory == nullptr) || (queue == nullptr)) {
+    return hipErrorInvalidValue;
+  }
+
+  amd::Command::EventWaitList waitList;
+  amd::Coord3D fillOffset(offset, 0, 0);
+  amd::Coord3D fillSize(sizeBytes, 1, 1);
+  command =
+      new amd::FillMemoryCommand(*queue, CL_COMMAND_FILL_BUFFER, waitList, *memory->asBuffer(),
+                                 &value, valueSize, fillOffset, fillSize);
+  if (command == nullptr) {
+    return hipErrorOutOfMemory;
+  }
+  return hipSuccess;
+}
+
+hipError_t ihipMemset_validate(void* dst, int64_t value, size_t valueSize,
+                               size_t sizeBytes) {
+  if (sizeBytes == 0) {
+    // Skip if nothing needs filling.
+    return hipSuccess;
+  }
+
+  if (dst == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  size_t offset = 0;
+  amd::Memory* memory = getMemoryObject(dst, offset);
+  if (memory == nullptr) {
+    // dst is an untracked host pointer, hence an error.
+    return hipErrorInvalidValue;
+  }
+  return hipSuccess;
+}
+
+hipError_t ihipMemsetCommand(std::vector<amd::Command*>& commands, void* dst, int64_t value,
+                             size_t valueSize, size_t sizeBytes, amd::HostQueue* queue) {
+  hipError_t hip_error = hipSuccess;
+  auto aligned_dst = amd::alignUp(reinterpret_cast<size_t>(dst), sizeof(uint64_t));
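+  // Example of the pattern widening below (illustrative comment): a memset of
+  // value 0xAB with valueSize == 1 is replicated into
+  //   value64 = 0xABABABABABABABAB
+  // so the aligned middle of the range is filled eight bytes per element; a
+  // misaligned head and a sub-8-byte tail fall back to fills at the original
+  // valueSize.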
+  size_t offset = 0;
+  amd::Memory* memory = getMemoryObject(dst, offset);
+  size_t n_head_bytes = 0;
+  size_t n_tail_bytes = 0;
+  int64_t value64 = 0;
+  amd::Command* command;
+
+  if (sizeBytes / sizeof(int64_t) > 0) {
+    n_head_bytes = aligned_dst - reinterpret_cast<size_t>(dst);
+    n_tail_bytes = ((sizeBytes - n_head_bytes) % sizeof(int64_t));
+    offset = offset + n_head_bytes;
+    size_t n_bytes = sizeBytes - n_tail_bytes - n_head_bytes;
+    if (n_bytes > 0) {
+      if (valueSize == sizeof(int8_t)) {
+        value = value & 0xff;
+        value64 = ((value << 56) | (value << 48) | (value << 40) | (value << 32) | (value << 24) |
+                   (value << 16) | (value << 8) | (value));
+      } else if (valueSize == sizeof(int16_t)) {
+        value = value & 0xffff;
+        value64 = ((value << 48) | (value << 32) | (value << 16) | (value));
+      } else if (valueSize == sizeof(int32_t)) {
+        value = value & 0xffffffff;
+        value64 = ((value << 32) | (value));
+      } else if (valueSize == sizeof(int64_t)) {
+        value64 = value;
+      } else {
+        LogPrintfError("Unsupported pattern size: %zu \n", valueSize);
+        return hipErrorInvalidValue;
+      }
+      // If n_tail_bytes is != 0 then we will do a second fillBuffer command
+      // on the same stream below; don't wait, do the first call async.
+      hip_error =
+          packFillMemoryCommand(command, memory, offset, value64, sizeof(int64_t), n_bytes, queue);
+      commands.push_back(command);
+    }
+    if (hip_error != hipSuccess) {
+      return hip_error;
+    }
+  } else {
+    n_head_bytes = sizeBytes;
+  }
+
+  if (n_head_bytes != 0) {
+    memory = getMemoryObject(dst, offset);
+    hip_error =
+        packFillMemoryCommand(command, memory, offset, value, valueSize, n_head_bytes, queue);
+    commands.push_back(command);
+  }
+
+  if (n_tail_bytes != 0) {
+    void* new_dst = (reinterpret_cast<char*>
(dst) + sizeBytes) - n_tail_bytes; + memory = getMemoryObject(new_dst, offset); + hip_error = + packFillMemoryCommand(command, memory, offset, value, valueSize, n_tail_bytes, queue); + commands.push_back(command); + } + return hip_error; +} + +hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeBytes, + hipStream_t stream, bool isAsync = false) { + hipError_t hip_error = hipSuccess; + hip_error = ihipMemset_validate(dst, value, valueSize, sizeBytes); + if (hip_error != hipSuccess) { + return hip_error; + } + std::vector commands; + amd::HostQueue* queue = hip::getQueue(stream); + hip_error = ihipMemsetCommand(commands, dst, value, valueSize, sizeBytes, queue); + if (hip_error != hipSuccess) { + return hip_error; + } + for (auto command : commands) { + command->enqueue(); + if (!isAsync) { + command->awaitCompletion(); + } + command->release(); + } + return hip_error; +} + +hipError_t hipMemset(void* dst, int value, size_t sizeBytes) { + HIP_INIT_API(hipMemset, dst, value, sizeBytes); + + HIP_RETURN(ihipMemset(dst, value, sizeof(int8_t), sizeBytes, nullptr)); +} + +hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream) { + HIP_INIT_API(hipMemsetAsync, dst, value, sizeBytes, stream); + size_t valueSize = sizeof(int8_t); + STREAM_CAPTURE(hipMemsetAsync, stream, dst, value, valueSize, sizeBytes); + + HIP_RETURN(ihipMemset(dst, value, sizeof(int8_t), sizeBytes, stream, true)); +} + +hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t count) { + HIP_INIT_API(hipMemsetD8, dst, value, count); + + HIP_RETURN(ihipMemset(dst, value, sizeof(int8_t), count * sizeof(int8_t), nullptr)); +} + +hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t count, + hipStream_t stream) { + HIP_INIT_API(hipMemsetD8Async, dst, value, count, stream); + + HIP_RETURN(ihipMemset(dst, value, sizeof(int8_t), count * sizeof(int8_t), stream, true)); +} + +hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t count) { + HIP_INIT_API(hipMemsetD16, dst, value, count); + + HIP_RETURN(ihipMemset(dst, value, sizeof(int16_t), count * sizeof(int16_t), nullptr)); +} + +hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t count, + hipStream_t stream) { + HIP_INIT_API(hipMemsetD16Async, dst, value, count, stream); + + HIP_RETURN(ihipMemset(dst, value, sizeof(int16_t), count * sizeof(int16_t), stream, true)); +} + +hipError_t hipMemsetD32(hipDeviceptr_t dst, int value, size_t count) { + HIP_INIT_API(hipMemsetD32, dst, value, count); + + HIP_RETURN(ihipMemset(dst, value, sizeof(int32_t), count * sizeof(int32_t), nullptr)); +} + +hipError_t hipMemsetD32Async(hipDeviceptr_t dst, int value, size_t count, + hipStream_t stream) { + HIP_INIT_API(hipMemsetD32Async, dst, value, count, stream); + + HIP_RETURN(ihipMemset(dst, value, sizeof(int32_t), count * sizeof(int32_t), stream, true)); +} + +hipError_t ihipMemset3D_validate(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, + size_t sizeBytes) { + size_t offset = 0; + amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset); + + if (memory == nullptr) { + return hipErrorInvalidValue; + } + if (sizeBytes > memory->getSize()) { + return hipErrorInvalidValue; + } + return hipSuccess; +} + +hipError_t ihipMemset3DCommand(std::vector &commands, hipPitchedPtr pitchedDevPtr, + int value, hipExtent extent, amd::HostQueue* queue) { + size_t offset = 0; + auto sizeBytes = extent.width * extent.height * extent.depth; + amd::Memory* memory = 
getMemoryObject(pitchedDevPtr.ptr, offset); + if (pitchedDevPtr.pitch == extent.width) { + return ihipMemsetCommand(commands, pitchedDevPtr.ptr, value, sizeof(int8_t), + static_cast(sizeBytes), queue); + } + // Workaround for cases when pitch > row until fill kernel will be updated to support pitch. + // Fall back to filling one row at a time. + amd::Coord3D origin(offset); + amd::Coord3D region(pitchedDevPtr.xsize, pitchedDevPtr.ysize, extent.depth); + amd::BufferRect rect; + if (pitchedDevPtr.pitch == 0 || + !rect.create(static_cast(origin), static_cast(region), pitchedDevPtr.pitch, + 0)) { + return hipErrorInvalidValue; + } + amd::FillMemoryCommand* command; + for (size_t slice = 0; slice < extent.depth; slice++) { + for (size_t row = 0; row < extent.height; row++) { + const size_t rowOffset = rect.offset(0, row, slice); + command = new amd::FillMemoryCommand( + *queue, CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList{}, *memory->asBuffer(), + &value, sizeof(int8_t), amd::Coord3D{rowOffset, 0, 0}, amd::Coord3D{extent.width, 1, 1}); + commands.push_back(command); + } + } + return hipSuccess; +} + + +hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, + hipStream_t stream, bool isAsync = false) { + auto sizeBytes = extent.width * extent.height * extent.depth; + + if (sizeBytes == 0) { + // sizeBytes is zero hence returning early as nothing to be set + return hipSuccess; + } + hipError_t status = ihipMemset3D_validate(pitchedDevPtr, value, extent, sizeBytes); + if (status != hipSuccess) { + return status; + } + amd::HostQueue* queue = hip::getQueue(stream); + std::vector commands; + ihipMemset3DCommand(commands, pitchedDevPtr, value, extent, queue); + for (auto& command : commands) { + command->enqueue(); + if (!isAsync) { + command->awaitCompletion(); + } + command->release(); + } + return hipSuccess; +} + +hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height) { + HIP_INIT_API(hipMemset2D, dst, pitch, value, width, height); + + HIP_RETURN(ihipMemset3D({dst, pitch, width, height}, value, {width, height, 1}, nullptr)); +} + +hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, + size_t width, size_t height, hipStream_t stream) { + HIP_INIT_API(hipMemset2DAsync, dst, pitch, value, width, height, stream); + + STREAM_CAPTURE(hipMemset2DAsync, stream, dst, pitch, value, width, height); + + HIP_RETURN(ihipMemset3D({dst, pitch, width, height}, value, {width, height, 1}, stream, true)); +} + +hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent) { + HIP_INIT_API(hipMemset3D, pitchedDevPtr, value, extent); + + HIP_RETURN(ihipMemset3D(pitchedDevPtr, value, extent, nullptr)); +} + +hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, hipStream_t stream) { + HIP_INIT_API(hipMemset3DAsync, pitchedDevPtr, value, extent, stream); + + STREAM_CAPTURE(hipMemset3DAsync, stream, pitchedDevPtr, value, extent); + + HIP_RETURN(ihipMemset3D(pitchedDevPtr, value, extent, stream, true)); +} + +hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch, size_t widthInBytes, + size_t height, unsigned int elementSizeBytes) { + HIP_INIT_API(hipMemAllocPitch, dptr, pitch, widthInBytes, height, elementSizeBytes); + + HIP_RETURN(hipMallocPitch(dptr, pitch, widthInBytes, height)); +} + +hipError_t hipMemAllocHost(void** ptr, size_t size) { + HIP_INIT_API(hipMemAllocHost, ptr, size); + + HIP_RETURN_DURATION(hipHostMalloc(ptr, size, 0)); +} + +hipError_t 
hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* dev_ptr) { + HIP_INIT_API(hipIpcGetMemHandle, handle, dev_ptr); + + amd::Device* device = nullptr; + ihipIpcMemHandle_t* ihandle = nullptr; + + if ((handle == nullptr) || (dev_ptr == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + device = hip::getCurrentDevice()->devices()[0]; + ihandle = reinterpret_cast(handle); + + if(!device->IpcCreate(dev_ptr, &(ihandle->psize), &(ihandle->ipc_handle), &(ihandle->poffset))) { + LogPrintfError("IPC memory creation failed for memory: 0x%x", dev_ptr); + HIP_RETURN(hipErrorInvalidDevicePointer); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipIpcOpenMemHandle(void** dev_ptr, hipIpcMemHandle_t handle, unsigned int flags) { + HIP_INIT_API(hipIpcOpenMemHandle, dev_ptr, &handle, flags); + + amd::Memory* amd_mem_obj = nullptr; + amd::Device* device = nullptr; + ihipIpcMemHandle_t* ihandle = nullptr; + + if (dev_ptr == nullptr || flags != hipIpcMemLazyEnablePeerAccess) { + HIP_RETURN(hipErrorInvalidValue); + } + + /* Call the IPC Attach from Device class */ + device = hip::getCurrentDevice()->devices()[0]; + ihandle = reinterpret_cast(&handle); + + if (ihandle->psize == 0) { + HIP_RETURN(hipErrorInvalidValue); + } + + if(!device->IpcAttach(&(ihandle->ipc_handle), ihandle->psize, + ihandle->poffset, flags, dev_ptr)) { + LogPrintfError("Cannot attach ipc_handle: with ipc_size: %u" + "ipc_offset: %u flags: %u", ihandle->psize, flags); + HIP_RETURN(hipErrorInvalidDevicePointer); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipIpcCloseMemHandle(void* dev_ptr) { + HIP_INIT_API(hipIpcCloseMemHandle, dev_ptr); + + size_t offset = 0; + amd::Device* device = nullptr; + amd::Memory* amd_mem_obj = nullptr; + + hip::getNullStream()->finish(); + + if (dev_ptr == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + /* Call IPC Detach from Device class */ + device = hip::getCurrentDevice()->devices()[0]; + if (device == nullptr) { + HIP_RETURN(hipErrorNoDevice); + } + + /* detach the memory */ + if (!device->IpcDetach(dev_ptr)){ + HIP_RETURN(hipErrorInvalidHandle); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipHostGetDevicePointer(void** devicePointer, void* hostPointer, unsigned flags) { + HIP_INIT_API(hipHostGetDevicePointer, devicePointer, hostPointer, flags); + + size_t offset = 0; + + amd::Memory* memObj = getMemoryObject(hostPointer, offset); + if (!memObj) { + HIP_RETURN(hipErrorInvalidValue); + } +*devicePointer = reinterpret_cast(memObj->getDeviceMemory(*hip::getCurrentDevice()->devices()[0])->virtualAddress() + offset); + + HIP_RETURN(hipSuccess); +} + +// ================================================================================================ +hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr) { + HIP_INIT_API(hipPointerGetAttributes, attributes, ptr); + + if (attributes == nullptr || ptr == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + size_t offset = 0; + amd::Memory* memObj = getMemoryObject(ptr, offset); + int device = 0; + memset(attributes, 0, sizeof(hipPointerAttribute_t)); + + if (memObj != nullptr) { + attributes->memoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & + memObj->getMemFlags())? 
hipMemoryTypeHost : hipMemoryTypeDevice; + if (attributes->memoryType == hipMemoryTypeHost) { + if (memObj->getHostMem() != nullptr) { + attributes->hostPointer = static_cast(memObj->getHostMem()) + offset; + } + else { + attributes->hostPointer = static_cast(memObj->getSvmPtr()) + offset; + } + } + + device::Memory* devMem = memObj->getDeviceMemory(*hip::getCurrentDevice()->devices()[0]); + //getDeviceMemory can fail, hence validate the sanity of the mem obtained + if (nullptr == devMem) { + DevLogPrintfError("getDeviceMemory for ptr failed : %p \n", ptr); + HIP_RETURN(hipErrorMemoryAllocation); + } + + attributes->devicePointer = reinterpret_cast(devMem->virtualAddress() + offset); + constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR); + attributes->isManaged = + ((memObj->getMemFlags() & kManagedAlloc) == kManagedAlloc) ? true : false; + attributes->allocationFlags = memObj->getMemFlags() >> 16; + + amd::Context* memObjCtx = &memObj->getContext(); + if (hip::host_device->asContext() == memObjCtx) { + attributes->device = ihipGetDevice(); + HIP_RETURN(hipSuccess); + } + for (auto& ctx : g_devices) { + if (ctx->asContext() == memObjCtx) { + attributes->device = device; + HIP_RETURN(hipSuccess); + } + ++device; + } + LogPrintfError("Cannot find memory object context, memObjCtx: 0x%x \n", memObjCtx); + HIP_RETURN(hipErrorInvalidDevice); + } + + LogPrintfError("Cannot get amd_mem_obj for ptr: 0x%x \n", ptr); + HIP_RETURN(hipErrorInvalidValue); +} + +// ================================================================================================ +hipError_t hipArrayDestroy(hipArray* array) { + HIP_INIT_API(hipArrayDestroy, array); + + HIP_RETURN(ihipArrayDestroy(array)); +} + +hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor, + hipArray* array) { + HIP_INIT_API(hipArray3DGetDescriptor, pArrayDescriptor, array); + + assert(false && "Unimplemented"); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor, + hipArray* array) { + HIP_INIT_API(hipArrayGetDescriptor, pArrayDescriptor, array); + + assert(false && "Unimplemented"); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, + hipStream_t stream) { + HIP_INIT_API(hipMemcpyParam2DAsync, pCopy); + + HIP_RETURN(ihipMemcpyParam2D(pCopy, stream, true)); +} + +hipError_t ihipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream, bool isAsync = false) { + hip_Memcpy2D desc = {}; + + desc.srcXInBytes = wOffsetSrc; + desc.srcY = hOffsetSrc; + desc.srcMemoryType = hipMemoryTypeArray; + desc.srcHost = nullptr; + desc.srcDevice = nullptr; + desc.srcArray = const_cast(src); + desc.srcPitch = 0; // Ignored. + + desc.dstXInBytes = wOffsetDst; + desc.dstY = hOffsetDst; + desc.dstMemoryType = hipMemoryTypeArray; + desc.dstHost = nullptr; + desc.dstDevice = nullptr; + desc.dstArray = dst; + desc.dstPitch = 0; // Ignored. 
+ + desc.WidthInBytes = width; + desc.Height = height; + + return ihipMemcpyParam2D(&desc, stream, isAsync); +} + +hipError_t hipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy2DArrayToArray, dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind); + + HIP_RETURN_DURATION(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); +} + +hipError_t hipMemcpyArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpyArrayToArray, dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind); + + HIP_RETURN_DURATION(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); +} + +hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy2DFromArray, dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind); + + if (dpitch == 0) { + HIP_RETURN(hipErrorInvalidPitchValue); + } + + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind, nullptr)); +} + +hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpy2DFromArrayAsync, dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream); + + if (dpitch == 0) { + HIP_RETURN(hipErrorInvalidPitchValue); + } + + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream, true)); +} + +hipError_t hipMemcpyFromArrayAsync(void* dst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpyFromArrayAsync, dst, src, wOffsetSrc, hOffsetSrc, count, kind, stream); + + if (src == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + const size_t arrayHeight = (src->height != 0) ? src->height : 1; + const size_t widthInBytes = count / arrayHeight; + + const size_t height = (count / src->width) / hip::getElementSize(src); + + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffsetSrc, widthInBytes, height, kind, stream, true)); +} + +hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpy2DToArrayAsync, dst, wOffset, hOffset, src, spitch, width, height, kind); + + if (spitch == 0) { + HIP_RETURN(hipErrorInvalidPitchValue); + } + + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, stream, true)); +} + +hipError_t hipMemcpyToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpyToArrayAsync, dst, wOffset, hOffset, src, count, kind); + + if (dst == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + const size_t arrayHeight = (dst->height != 0) ? 
dst->height : 1; + const size_t widthInBytes = count / arrayHeight; + + const size_t height = (count / dst->width) / hip::getElementSize(dst); + + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, widthInBytes, height, kind, stream, true)); +} + +hipError_t hipMemcpyAtoA(hipArray* dstArray, + size_t dstOffset, + hipArray* srcArray, + size_t srcOffset, + size_t ByteCount) { + HIP_INIT_API(hipMemcpyAtoA, dstArray, dstOffset, srcArray, srcOffset, ByteCount); + + HIP_RETURN_DURATION(ihipMemcpyAtoA(srcArray, dstArray, {srcOffset, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, nullptr)); +} + +hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, + hipArray* srcArray, + size_t srcOffset, + size_t ByteCount) { + HIP_INIT_API(hipMemcpyAtoD, dstDevice, srcArray, srcOffset, ByteCount); + + HIP_RETURN_DURATION(ihipMemcpyAtoD(srcArray, dstDevice, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); +} + +hipError_t hipMemcpyAtoHAsync(void* dstHost, + hipArray* srcArray, + size_t srcOffset, + size_t ByteCount, + hipStream_t stream) { + HIP_INIT_API(hipMemcpyAtoHAsync, dstHost, srcArray, srcOffset, ByteCount, stream); + + HIP_RETURN_DURATION(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true)); +} + +hipError_t hipMemcpyDtoA(hipArray* dstArray, + size_t dstOffset, + hipDeviceptr_t srcDevice, + size_t ByteCount) { + HIP_INIT_API(hipMemcpyDtoA, dstArray, dstOffset, srcDevice, ByteCount); + + HIP_RETURN_DURATION(ihipMemcpyDtoA(srcDevice, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); +} + +hipError_t hipMemcpyHtoAAsync(hipArray* dstArray, + size_t dstOffset, + const void* srcHost, + size_t ByteCount, + hipStream_t stream) { + HIP_INIT_API(hipMemcpyHtoAAsync, dstArray, dstOffset, srcHost, ByteCount, stream); + + HIP_RETURN_DURATION(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true)); +} + +hipError_t hipMallocHost(void** ptr, + size_t size) { + HIP_INIT_API(hipMallocHost, ptr, size); + + HIP_RETURN_DURATION(ihipMalloc(ptr, size, CL_MEM_SVM_FINE_GRAIN_BUFFER), (ptr != nullptr)? 
*ptr : nullptr); +} + +hipError_t hipFreeHost(void *ptr) { + HIP_INIT_API(hipFreeHost, ptr); + + HIP_RETURN(ihipFree(ptr)); +} + +hipError_t hipDrvMemcpy2DUnaligned(const hip_Memcpy2D* pCopy) { + HIP_INIT_API(hipDrvMemcpy2DUnaligned, pCopy); + + HIP_MEMCPY3D desc = hip::getDrvMemcpy3DDesc(*pCopy); + + HIP_RETURN(ihipMemcpyParam3D(&desc, nullptr)); +} + +hipError_t hipMallocMipmappedArray(hipMipmappedArray_t *mipmappedArray, + const hipChannelFormatDesc* desc, + hipExtent extent, + unsigned int numLevels, + unsigned int flags) { + HIP_INIT_API(hipMallocMipmappedArray, mipmappedArray, desc, extent, numLevels, flags); + + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray) { + HIP_INIT_API(hipFreeMipmappedArray, mipmappedArray); + + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t hipGetMipmappedArrayLevel(hipArray_t *levelArray, + hipMipmappedArray_const_t mipmappedArray, + unsigned int level) { + HIP_INIT_API(hipGetMipmappedArrayLevel, levelArray, mipmappedArray, level); + + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t ihipMipmapArrayCreate(hipMipmappedArray_t* mipmapped_array_pptr, + HIP_ARRAY3D_DESCRIPTOR* mipmapped_array_desc_ptr, + unsigned int num_mipmap_levels) { + + const cl_channel_order channel_order = hip::getCLChannelOrder( + mipmapped_array_desc_ptr->NumChannels, 0); + const cl_channel_type channel_type = hip::getCLChannelType(mipmapped_array_desc_ptr->Format, + hipReadModeElementType); + const cl_mem_object_type image_type = hip::getCLMemObjectType(mipmapped_array_desc_ptr->Width, + mipmapped_array_desc_ptr->Height, + mipmapped_array_desc_ptr->Depth, + mipmapped_array_desc_ptr->Flags); + + // Create a new amd::Image with mipmap + amd::Image* image = ihipImageCreate(channel_order, + channel_type, + image_type, + mipmapped_array_desc_ptr->Width, + mipmapped_array_desc_ptr->Height, + mipmapped_array_desc_ptr->Depth, + mipmapped_array_desc_ptr->Depth, + 0 /* row pitch */, + 0 /* slice pitch */, + num_mipmap_levels, + nullptr /* buffer */); + + if (image == nullptr) { + return hipErrorInvalidValue; + } + + cl_mem cl_mem_obj = as_cl(image); + *mipmapped_array_pptr = new hipMipmappedArray(); + (*mipmapped_array_pptr)->data = reinterpret_cast(cl_mem_obj); + + (*mipmapped_array_pptr)->desc = hip::getChannelFormatDesc( + mipmapped_array_desc_ptr->NumChannels, + mipmapped_array_desc_ptr->Format); + (*mipmapped_array_pptr)->type = image_type; + (*mipmapped_array_pptr)->width = mipmapped_array_desc_ptr->Width; + (*mipmapped_array_pptr)->height = mipmapped_array_desc_ptr->Height; + (*mipmapped_array_pptr)->depth = mipmapped_array_desc_ptr->Depth; + (*mipmapped_array_pptr)->min_mipmap_level = 0; + (*mipmapped_array_pptr)->max_mipmap_level = num_mipmap_levels; + (*mipmapped_array_pptr)->flags = mipmapped_array_desc_ptr->Flags; + (*mipmapped_array_pptr)->format = mipmapped_array_desc_ptr->Format; + + return hipSuccess; +} + +hipError_t ihipMipmappedArrayDestroy(hipMipmappedArray_t mipmapped_array_ptr) { + + if (mipmapped_array_ptr == nullptr) { + return hipErrorInvalidValue; + } + + cl_mem mem_obj = reinterpret_cast(mipmapped_array_ptr->data); + if (is_valid(mem_obj) == false) { + return hipErrorInvalidValue; + } + + for (auto& dev : g_devices) { + dev->NullStream()->finish(); + } + + as_amd(mem_obj)->release(); + + delete mipmapped_array_ptr; + + return hipSuccess; +} + +hipError_t ihipMipmappedArrayGetLevel(hipArray_t* level_array_pptr, + hipMipmappedArray_t mipmapped_array_ptr, + unsigned int mip_level) { + + if 
(level_array_pptr == nullptr || mipmapped_array_ptr == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  // Convert the raw data to amd::Image
+  cl_mem cl_mem_obj = reinterpret_cast<cl_mem>(mipmapped_array_ptr->data);
+  if (is_valid(cl_mem_obj) == false) {
+    return hipErrorInvalidValue;
+  }
+
+  amd::Image* image = as_amd(cl_mem_obj)->asImage();
+  if (image == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  // Create new hip Array parameter and create an image view with new mip level.
+  (*level_array_pptr) = new hipArray();
+  (*level_array_pptr)->data = as_cl(image->createView(image->getContext(),
+                                                      image->getImageFormat(),
+                                                      NULL, mip_level, 0));
+
+  // Copy the new width, height & depth details of the flag to hipArray.
+  cl_mem cl_mip_mem_obj = reinterpret_cast<cl_mem>((*level_array_pptr)->data);
+  if (is_valid(cl_mip_mem_obj) == false) {
+    return hipErrorInvalidValue;
+  }
+
+  // Fill the hip_array info from newly created amd::Image's view
+  amd::Image* mipmap_image = as_amd(cl_mip_mem_obj)->asImage();
+  (*level_array_pptr)->width = mipmap_image->getWidth();
+  (*level_array_pptr)->height = mipmap_image->getHeight();
+  (*level_array_pptr)->depth = mipmap_image->getDepth();
+
+  const cl_mem_object_type image_type = hip::getCLMemObjectType((*level_array_pptr)->width,
+                                                                (*level_array_pptr)->height,
+                                                                (*level_array_pptr)->depth,
+                                                                mipmapped_array_ptr->flags);
+  (*level_array_pptr)->type = image_type;
+  (*level_array_pptr)->Format = mipmapped_array_ptr->format;
+  (*level_array_pptr)->desc = mipmapped_array_ptr->desc;
+  (*level_array_pptr)->NumChannels = hip::getNumChannels((*level_array_pptr)->desc);
+  (*level_array_pptr)->isDrv = 0;
+  (*level_array_pptr)->textureType = 0;
+
+  return hipSuccess;
+}
+
+hipError_t hipMipmappedArrayCreate(hipMipmappedArray_t* mipmapped_array_pptr,
+                                   HIP_ARRAY3D_DESCRIPTOR* mipmapped_array_desc_ptr,
+                                   unsigned int num_mipmap_levels) {
+  HIP_INIT_API(hipMipmappedArrayCreate, mipmapped_array_pptr, mipmapped_array_desc_ptr,
+               num_mipmap_levels);
+
+  HIP_RETURN(ihipMipmapArrayCreate(mipmapped_array_pptr, mipmapped_array_desc_ptr,
+                                   num_mipmap_levels));
+}
+
+hipError_t hipMipmappedArrayDestroy(hipMipmappedArray_t mipmapped_array_ptr) {
+  HIP_INIT_API(hipMipmappedArrayDestroy, mipmapped_array_ptr);
+
+  HIP_RETURN(ihipMipmappedArrayDestroy(mipmapped_array_ptr));
+}
+
+hipError_t hipMipmappedArrayGetLevel(hipArray_t* level_array_pptr,
+                                     hipMipmappedArray_t mipmapped_array_ptr,
+                                     unsigned int mip_level) {
+  HIP_INIT_API(hipMipmappedArrayGetLevel, level_array_pptr, mipmapped_array_ptr, mip_level);
+
+  HIP_RETURN(ihipMipmappedArrayGetLevel(level_array_pptr, mipmapped_array_ptr, mip_level));
+}
+
diff --git a/rocclr/hip_module.cpp b/rocclr/hip_module.cpp
new file mode 100755
index 0000000000..464473b1a8
--- /dev/null
+++ b/rocclr/hip_module.cpp
@@ -0,0 +1,672 @@
+/* Copyright (c) 2015-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include +#include +#include + +#include "hip_internal.hpp" +#include "platform/program.hpp" +#include "hip_event.hpp" +#include "hip_platform.hpp" + +hipError_t ihipModuleLoadData(hipModule_t* module, const void* mmap_ptr, size_t mmap_size); + +extern hipError_t ihipLaunchKernel(const void* hostFunction, + dim3 gridDim, + dim3 blockDim, + void** args, + size_t sharedMemBytes, + hipStream_t stream, + hipEvent_t startEvent, + hipEvent_t stopEvent, + int flags); + +const std::string& FunctionName(const hipFunction_t f) { + return hip::DeviceFunc::asFunction(f)->kernel()->name(); +} + +static uint64_t ElfSize(const void *emi) +{ + return amd::Elf::getElfSize(emi); +} + +hipError_t hipModuleUnload(hipModule_t hmod) { + HIP_INIT_API(hipModuleUnload, hmod); + + HIP_RETURN(PlatformState::instance().unloadModule(hmod)); +} + +hipError_t hipModuleLoad(hipModule_t* module, const char* fname) { + HIP_INIT_API(hipModuleLoad, module, fname); + + HIP_RETURN(PlatformState::instance().loadModule(module, fname)); +} + +hipError_t hipModuleLoadData(hipModule_t *module, const void *image) +{ + HIP_INIT_API(hipModuleLoadData, module, image); + + HIP_RETURN(PlatformState::instance().loadModule(module, 0, image)); +} + +hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, + unsigned int numOptions, hipJitOption* options, + void** optionsValues) +{ + /* TODO: Pass options to Program */ + HIP_INIT_API(hipModuleLoadDataEx, module, image); + + HIP_RETURN(PlatformState::instance().loadModule(module, 0, image)); +} + +extern hipError_t __hipExtractCodeObjectFromFatBinary(const void* data, + const std::vector& devices, + std::vector>& code_objs); + +hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name) { + HIP_INIT_API(hipModuleGetFunction, hfunc, hmod, name); + + if(hfunc == nullptr || name == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + if (hipSuccess != PlatformState::instance().getDynFunc(hfunc, hmod, name)) { + LogPrintfError("Cannot find the function: %s for module: 0x%x \n", name, hmod); + HIP_RETURN(hipErrorNotFound); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod, const char* name) +{ + HIP_INIT_API(hipModuleGetGlobal, dptr, bytes, hmod, name); + + if(dptr == nullptr || bytes == nullptr || name == nullptr) { + return hipErrorInvalidValue; + } + + /* Get address and size for the global symbol */ + if (hipSuccess != PlatformState::instance().getDynGlobalVar(name, hmod, dptr, bytes)) { + LogPrintfError("Cannot find global Var: %s for module: 0x%x at device: %d \n", name, hmod, + ihipGetDevice()); + HIP_RETURN(hipErrorNotFound); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc) { + HIP_INIT_API(hipFuncGetAttribute, value, attrib, hfunc); + + if ((value == nullptr) || (hfunc == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + hip::DeviceFunc* function = hip::DeviceFunc::asFunction(hfunc); + if (function == nullptr) { 
+ HIP_RETURN(hipErrorInvalidHandle); + } + + amd::Kernel* kernel = function->kernel(); + if (kernel == nullptr) { + HIP_RETURN(hipErrorInvalidDeviceFunction); + } + + const device::Kernel::WorkGroupInfo* wrkGrpInfo + = kernel->getDeviceKernel(*(hip::getCurrentDevice()->devices()[0]))->workGroupInfo(); + if (wrkGrpInfo == nullptr) { + HIP_RETURN(hipErrorMissingConfiguration); + } + + switch(attrib) { + case HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: + *value = static_cast(wrkGrpInfo->localMemSize_); + break; + case HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: + *value = static_cast(wrkGrpInfo->size_); + break; + case HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: + *value = 0; + break; + case HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: + *value = static_cast(wrkGrpInfo->privateMemSize_); + break; + case HIP_FUNC_ATTRIBUTE_NUM_REGS: + *value = static_cast(wrkGrpInfo->usedVGPRs_); + break; + case HIP_FUNC_ATTRIBUTE_PTX_VERSION: + *value = 30; // Defaults to 3.0 as HCC + break; + case HIP_FUNC_ATTRIBUTE_BINARY_VERSION: + *value = static_cast(kernel->signature().version()); + break; + case HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA: + *value = 0; + break; + case HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: + *value = static_cast(wrkGrpInfo->availableLDSSize_ - wrkGrpInfo->localMemSize_); + break; + case HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: + *value = 0; + break; + default: + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) +{ + HIP_INIT_API(hipFuncGetAttributes, attr, func); + + HIP_RETURN_ONFAIL(PlatformState::instance().getStatFuncAttr(attr, func, ihipGetDevice())); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipFuncSetAttribute ( const void* func, hipFuncAttribute attr, int value ) { + HIP_INIT_API(hipFuncSetAttribute, func, attr, value); + + // No way to set function attribute yet. + + HIP_RETURN(hipSuccess); +} + +hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t cacheConfig) { + + HIP_INIT_API(hipFuncSetCacheConfig, cacheConfig); + + // No way to set cache config yet. + + HIP_RETURN(hipSuccess); +} + +hipError_t hipFuncSetSharedMemConfig ( const void* func, hipSharedMemConfig config) { + HIP_INIT_API(hipFuncSetSharedMemConfig, func, config); + + // No way to set Shared Memory config function yet. 
+ + HIP_RETURN(hipSuccess); +} + +hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, + uint32_t blockDimX, uint32_t blockDimY, + uint32_t blockDimZ, uint32_t sharedMemBytes, + void** kernelParams, void** extra, int deviceId, + uint32_t params = 0) { + if (f == nullptr) { + LogPrintfError("%s", "Function passed is null"); + return hipErrorInvalidImage; + } + if ((kernelParams != nullptr) && (extra != nullptr)) { + LogPrintfError("%s", + "Both, kernelParams and extra Params are provided, only one should be provided"); + return hipErrorInvalidValue; + } + if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0 || blockDimX == 0 || + blockDimY == 0 || blockDimZ == 0) { + return hipErrorInvalidValue; + } + + if (extra != nullptr) { + if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER || extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || + extra[4] != HIP_LAUNCH_PARAM_END) { + return hipErrorNotInitialized; + } + } + + const amd::Device* device = g_devices[deviceId]->devices()[0]; + // Make sure dispatch doesn't exceed max workgroup size limit + if (blockDimX * blockDimY * blockDimZ > device->info().maxWorkGroupSize_) { + return hipErrorInvalidConfiguration; + } + hip::DeviceFunc* function = hip::DeviceFunc::asFunction(f); + amd::Kernel* kernel = function->kernel(); + // Make sure the launch params are not larger than if specified launch_bounds + // If it exceeds, then return a failure + if (blockDimX * blockDimY * blockDimZ > + kernel->getDeviceKernel(*device)->workGroupInfo()->size_) { + LogPrintfError("%s", "Launch params are larger than launch bounds"); + return hipErrorLaunchFailure; + } + + if (params & amd::NDRangeKernelCommand::CooperativeGroups) { + if (!device->info().cooperativeGroups_) { + return hipErrorLaunchFailure; + } + int num_blocks = 0; + int max_blocks_per_grid = 0; + int best_block_size = 0; + int block_size = blockDimX * blockDimY * blockDimZ; + hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, &max_blocks_per_grid, + &best_block_size, *device, f, + block_size, sharedMemBytes, true); + if (((globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ) / block_size) > + unsigned(max_blocks_per_grid)) { + return hipErrorCooperativeLaunchTooLarge; + } + } + if (params & amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups) { + if (!device->info().cooperativeMultiDeviceGroups_) { + return hipErrorLaunchFailure; + } + } + address kernargs = nullptr; + // 'extra' is a struct that contains the following info: { + // HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs, + // HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernargs_size, + // HIP_LAUNCH_PARAM_END } + if (extra != nullptr) { + if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER || extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE || + extra[4] != HIP_LAUNCH_PARAM_END) { + return hipErrorNotInitialized; + } + kernargs = reinterpret_cast
<address>(extra[1]);
+  }
+
+  const amd::KernelSignature& signature = kernel->signature();
+  for (size_t i = 0; i < signature.numParameters(); ++i) {
+    const amd::KernelParameterDescriptor& desc = signature.at(i);
+    if (kernelParams == nullptr) {
+      assert(kernargs != nullptr);
+      kernel->parameters().set(i, desc.size_, kernargs + desc.offset_,
+                               desc.type_ == T_POINTER /*svmBound*/);
+    } else {
+      assert(extra == nullptr);
+      kernel->parameters().set(i, desc.size_, kernelParams[i],
+                               desc.type_ == T_POINTER /*svmBound*/);
+    }
+  }
+  return hipSuccess;
+}
+
+hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
+                                   uint32_t globalWorkSizeX, uint32_t globalWorkSizeY,
+                                   uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY,
+                                   uint32_t blockDimZ, uint32_t sharedMemBytes,
+                                   amd::HostQueue* queue, void** kernelParams, void** extra,
+                                   hipEvent_t startEvent = nullptr, hipEvent_t stopEvent = nullptr,
+                                   uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0,
+                                   uint32_t numGrids = 0, uint64_t prevGridSum = 0,
+                                   uint64_t allGridSum = 0, uint32_t firstDevice = 0) {
+  hip::DeviceFunc* function = hip::DeviceFunc::asFunction(f);
+  amd::Kernel* kernel = function->kernel();
+
+  size_t globalWorkOffset[3] = {0};
+  size_t globalWorkSize[3] = {globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ};
+  size_t localWorkSize[3] = {blockDimX, blockDimY, blockDimZ};
+  amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
+  amd::Command::EventWaitList waitList;
+  bool profileNDRange = false;
+  address kernargs = nullptr;
+
+  profileNDRange = (startEvent != nullptr || stopEvent != nullptr);
+
+  // Flag set to 1 signifies that the kernel can be launched in any order
+  if (flags & hipExtAnyOrderLaunch) {
+    params |= amd::NDRangeKernelCommand::AnyOrderLaunch;
+  }
+
+  amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(
+      *queue, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum,
+      allGridSum, firstDevice, profileNDRange);
+  if (!kernelCommand) {
+    return hipErrorOutOfMemory;
+  }
+
+  // Capture the kernel arguments
+  if (CL_SUCCESS != kernelCommand->captureAndValidate()) {
+    delete kernelCommand;
+    return hipErrorOutOfMemory;
+  }
+  command = kernelCommand;
+  return hipSuccess;
+}
+
+hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
+                                  uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+                                  uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
+                                  uint32_t sharedMemBytes, hipStream_t hStream, void** kernelParams,
+                                  void** extra, hipEvent_t startEvent, hipEvent_t stopEvent,
+                                  uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0,
+                                  uint32_t numGrids = 0, uint64_t prevGridSum = 0,
+                                  uint64_t allGridSum = 0, uint32_t firstDevice = 0) {
+  HIP_INIT_API(ihipModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
+               blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra,
+               startEvent, stopEvent, flags, params);
+
+  int deviceId = hip::Stream::DeviceId(hStream);
+  HIP_RETURN_ONFAIL(PlatformState::instance().initStatManagedVarDevicePtr(deviceId));
+  if (f == nullptr) {
+    LogPrintfError("%s", "Function passed is null");
+    return hipErrorInvalidImage;
+  }
+  hip::DeviceFunc* function = hip::DeviceFunc::asFunction(f);
+  amd::Kernel* kernel = function->kernel();
+  amd::ScopedLock lock(function->dflock_);
+
+  hipError_t status =
+      ihipLaunchKernel_validate(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX,
+                                blockDimY, blockDimZ, sharedMemBytes, kernelParams, extra,
+                                deviceId, params);
+  if (status != hipSuccess) {
+    return status;
+  }
+  amd::Command* command = nullptr;
+  amd::HostQueue* queue = hip::getQueue(hStream);
+  status = ihipLaunchKernelCommand(command, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
+                                   blockDimX, blockDimY, blockDimZ, sharedMemBytes, queue,
+                                   kernelParams, extra, startEvent, stopEvent, flags, params,
+                                   gridId, numGrids, prevGridSum, allGridSum, firstDevice);
+  if (status != hipSuccess) {
+    return status;
+  }
+
+  hip::Event* eStart = reinterpret_cast<hip::Event*>(startEvent);
+  hip::Event* eStop = reinterpret_cast<hip::Event*>(stopEvent);
+  command->enqueue();
+
+  if (startEvent != nullptr) {
+    eStart->addMarker(queue, command, false);
+    command->retain();
+  }
+  if (stopEvent != nullptr) {
+    eStop->addMarker(queue, command, true);
+    command->retain();
+  }
+  command->release();
+
+  return hipSuccess;
+}
+
+hipError_t hipModuleLaunchKernel(hipFunction_t f,
+                                 uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
+                                 uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
+                                 uint32_t sharedMemBytes, hipStream_t hStream,
+                                 void **kernelParams, void **extra)
+{
+  HIP_INIT_API(hipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ,
+               blockDimX, blockDimY, blockDimZ,
+               sharedMemBytes, hStream,
+               kernelParams, extra);
+  size_t globalWorkSizeX = static_cast<size_t>(gridDimX) * blockDimX;
+  size_t globalWorkSizeY = static_cast<size_t>(gridDimY) * blockDimY;
+  size_t globalWorkSizeZ = static_cast<size_t>(gridDimZ) * blockDimZ;
+  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+    HIP_RETURN(hipErrorInvalidConfiguration);
+  }
+  HIP_RETURN(ihipModuleLaunchKernel(f, static_cast<uint32_t>(globalWorkSizeX),
+                                    static_cast<uint32_t>(globalWorkSizeY),
+                                    static_cast<uint32_t>(globalWorkSizeZ),
+                                    blockDimX, blockDimY, blockDimZ,
+                                    sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr));
+}
+
+hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
+                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+                                    uint32_t localWorkSizeX, uint32_t localWorkSizeY,
+                                    uint32_t localWorkSizeZ, size_t sharedMemBytes,
+                                    hipStream_t hStream, void** kernelParams, void** extra,
+                                    hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags)
+{
+  HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
+               localWorkSizeX, localWorkSizeY, localWorkSizeZ,
+               sharedMemBytes, hStream,
+               kernelParams, extra, startEvent, stopEvent, flags);
+
+  HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
+                                    localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags));
+}
+
+
+
+hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
+                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+                                    uint32_t blockDimX, uint32_t blockDimY,
+                                    uint32_t blockDimZ, size_t sharedMemBytes,
+                                    hipStream_t hStream, void** kernelParams, void** extra,
+                                    hipEvent_t startEvent,
+                                    hipEvent_t stopEvent)
+{
+  HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
+               blockDimX, blockDimY, blockDimZ,
+               sharedMemBytes, hStream,
+               kernelParams, extra, startEvent, stopEvent);
+
+  HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ,
+                                    sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
+}
+
+hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t globalWorkSizeX,
+                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+                                    uint32_t blockDimX, uint32_t blockDimY,
+                                    uint32_t blockDimZ, size_t sharedMemBytes,
+                                    hipStream_t hStream, void** kernelParams, void** extra,
+                                    hipEvent_t startEvent,
+                                    hipEvent_t stopEvent)
+{
+  HIP_INIT_API(hipModuleLaunchKernelExt, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
+               blockDimX, blockDimY, blockDimZ,
+               sharedMemBytes, hStream,
+               kernelParams, extra, startEvent, stopEvent);
+
+  HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ,
+                                    sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
+}
+
+extern "C" hipError_t hipLaunchKernel(const void *hostFunction,
+                                      dim3 gridDim,
+                                      dim3 blockDim,
+                                      void** args,
+                                      size_t sharedMemBytes,
+                                      hipStream_t stream)
+{
+  HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream);
+  STREAM_CAPTURE(hipLaunchKernel, stream, hostFunction, gridDim, blockDim, args, sharedMemBytes);
+  HIP_RETURN(ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream,
+                              nullptr, nullptr, 0));
+}
+
+extern "C" hipError_t hipExtLaunchKernel(const void* hostFunction,
+                                         dim3 gridDim,
+                                         dim3 blockDim,
+                                         void** args,
+                                         size_t sharedMemBytes,
+                                         hipStream_t stream,
+                                         hipEvent_t startEvent,
+                                         hipEvent_t stopEvent,
+                                         int flags)
+{
+  HIP_INIT_API(hipExtLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream);
+  HIP_RETURN(ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream, startEvent, stopEvent, flags));
+}
+
+hipError_t hipLaunchCooperativeKernel(const void* f,
+                                      dim3 gridDim, dim3 blockDim,
+                                      void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream)
+{
+  HIP_INIT_API(hipLaunchCooperativeKernel, f, gridDim, blockDim,
+               sharedMemBytes, hStream);
+
+  hipFunction_t func = nullptr;
+  int deviceId = hip::Stream::DeviceId(hStream);
+  HIP_RETURN_ONFAIL(PlatformState::instance().getStatFunc(&func, f, deviceId));
+  size_t globalWorkSizeX = static_cast<size_t>(gridDim.x) * blockDim.x;
+  size_t globalWorkSizeY = static_cast<size_t>(gridDim.y) * blockDim.y;
+  size_t globalWorkSizeZ = static_cast<size_t>(gridDim.z) * blockDim.z;
+  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+    HIP_RETURN(hipErrorInvalidConfiguration);
+  }
+  HIP_RETURN(ihipModuleLaunchKernel(func, static_cast<uint32_t>(globalWorkSizeX),
+                                    static_cast<uint32_t>(globalWorkSizeY),
+                                    static_cast<uint32_t>(globalWorkSizeZ),
+                                    blockDim.x, blockDim.y, blockDim.z,
+                                    sharedMemBytes, hStream, kernelParams, nullptr, nullptr, nullptr, 0,
+                                    amd::NDRangeKernelCommand::CooperativeGroups));
+}
+
+hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                  int numDevices, unsigned int flags, uint32_t extFlags)
+{
+  int numActiveGPUs = 0;
+  ihipDeviceGetCount(&numActiveGPUs);
+
+  if ((numDevices > numActiveGPUs) || (launchParamsList == nullptr)) {
+    return hipErrorInvalidValue;
+  }
+
+  hipError_t result = hipErrorUnknown;
+  uint64_t allGridSize = 0;
+  uint32_t blockDims = 0;
+  std::vector<const amd::Device*> mgpu_list(numDevices);
+
+  for (int i = 0; i < numDevices; ++i) {
+    const hipLaunchParams& launch = launchParamsList[i];
+    blockDims = launch.blockDim.x * launch.blockDim.y * launch.blockDim.z;
+    allGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z *
+                   blockDims;
+
+    // Make sure block dimensions are valid
+    if (0 == blockDims) {
+      return hipErrorInvalidConfiguration;
+    }
+    if (launch.stream != nullptr) {
+      // Validate devices to make sure the list doesn't have duplicates
+      amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(launch.stream)->asHostQueue();
+      auto device = &queue->vdev()->device();
+      for (int j = 0; j < numDevices; ++j) {
+        if (mgpu_list[j] == device) {
+          return hipErrorInvalidDevice;
+        }
+      }
+      mgpu_list[i] = device;
+    } else {
+      return hipErrorInvalidResourceHandle;
+    }
+  }
+  uint64_t prevGridSize = 0;
+  uint32_t firstDevice = 0;
+
+  // Sync the execution streams on all devices
+  if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) {
+    for (int i = 0; i < numDevices; ++i) {
+      amd::HostQueue* queue =
+          reinterpret_cast<hip::Stream*>(launchParamsList[i].stream)->asHostQueue();
+      queue->finish();
+    }
+  }
+
+  for (int i = 0; i < numDevices; ++i) {
+    const hipLaunchParams& launch = launchParamsList[i];
+    amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(launch.stream)->asHostQueue();
+    hipFunction_t func = nullptr;
+    // The order of devices in the launch may not match the order in the global array
+    for (size_t dev = 0; dev < g_devices.size(); ++dev) {
+      // Find the matching device and request the kernel function
+      if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) {
+        IHIP_RETURN_ONFAIL(PlatformState::instance().getStatFunc(&func, launch.func, dev));
+        // Save ROCclr index of the first device in the launch
+        if (i == 0) {
+          firstDevice = queue->vdev()->device().index();
+        }
+        break;
+      }
+    }
+    if (func == nullptr) {
+      result = hipErrorInvalidDeviceFunction;
+      HIP_RETURN(result);
+    }
+    size_t globalWorkSizeX = static_cast<size_t>(launch.gridDim.x) * launch.blockDim.x;
+    size_t globalWorkSizeY = static_cast<size_t>(launch.gridDim.y) * launch.blockDim.y;
+    size_t globalWorkSizeZ = static_cast<size_t>(launch.gridDim.z) * launch.blockDim.z;
+    if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
+        globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
+        globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+      HIP_RETURN(hipErrorInvalidConfiguration);
+    }
+    result = ihipModuleLaunchKernel(func, static_cast<uint32_t>(globalWorkSizeX),
+                                    static_cast<uint32_t>(globalWorkSizeY), static_cast<uint32_t>(globalWorkSizeZ),
+                                    launch.blockDim.x, launch.blockDim.y, launch.blockDim.z,
+                                    launch.sharedMem, launch.stream, launch.args, nullptr, nullptr, nullptr,
+                                    flags, extFlags, i, numDevices, prevGridSize, allGridSize, firstDevice);
+    if (result != hipSuccess) {
+      break;
+    }
+    prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ;
+  }
+
+  // Sync the execution streams on all devices
+  if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) {
+    for (int i = 0; i < numDevices; ++i) {
+      amd::HostQueue* queue =
+          reinterpret_cast<hip::Stream*>(launchParamsList[i].stream)->asHostQueue();
+      queue->finish();
+    }
+  }
+
+  return result;
+}
+
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int numDevices, unsigned int flags)
+{
+  HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags);
+
+  return HIP_RETURN(ihipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags,
+                    (amd::NDRangeKernelCommand::CooperativeGroups |
+                     amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups)));
+}
+
+hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                              int numDevices, unsigned int flags) {
+  HIP_INIT_API(hipExtLaunchMultiKernelMultiDevice, launchParamsList, numDevices, flags);
+
+  return HIP_RETURN(ihipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, 0));
+}
+
+hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
char* name) { + HIP_INIT_API(hipModuleGetTexRef, texRef, hmod, name); + + /* input args check */ + if ((texRef == nullptr) || (name == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + /* Get address and size for the global symbol */ + if (hipSuccess != PlatformState::instance().getDynTexRef(name, hmod, texRef)) { + LogPrintfError("Cannot get texRef for name: %s at module:0x%x \n", name, hmod); + HIP_RETURN(hipErrorNotFound); + } + + // Texture references created by HIP driver API + // have the default read mode set to normalized float. + (*texRef)->readMode = hipReadModeNormalizedFloat; + + PlatformState::instance().registerTexRef(*texRef, hmod, std::string(name)); + + HIP_RETURN(hipSuccess); +} diff --git a/rocclr/hip_peer.cpp b/rocclr/hip_peer.cpp new file mode 100755 index 0000000000..74c30fb411 --- /dev/null +++ b/rocclr/hip_peer.cpp @@ -0,0 +1,250 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#include <algorithm>
+
+#include "hip_internal.hpp"
+
+hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, hipCtx_t thisCtx, hipCtx_t peerCtx) {
+  HIP_INIT_API(NONE, canAccessPeer, thisCtx, peerCtx);
+
+  assert(0 && "Unimplemented");
+
+  HIP_RETURN(hipErrorNotSupported);
+}
+
+hipError_t hipMemcpyPeer(void* dst, hipCtx_t dstCtx, const void* src, hipCtx_t srcCtx,
+                         size_t sizeBytes) {
+  HIP_INIT_API(NONE, dst, dstCtx, src, srcCtx, sizeBytes);
+
+  assert(0 && "Unimplemented");
+
+  HIP_RETURN(hipErrorNotSupported);
+}
+
+hipError_t hipMemcpyPeerAsync(void* dst, hipCtx_t dstDevice, const void* src, hipCtx_t srcDevice,
+                              size_t sizeBytes, hipStream_t stream) {
+  HIP_INIT_API(NONE, dst, dstDevice, src, srcDevice, sizeBytes, stream);
+
+  assert(0 && "Unimplemented");
+
+  HIP_RETURN(hipErrorNotSupported);
+}
+
+hipError_t canAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId) {
+  amd::Device* device = nullptr;
+  amd::Device* peer_device = nullptr;
+  if (canAccessPeer == nullptr) {
+    return hipErrorInvalidValue;
+  }
+  /* Peer cannot be self */
+  if (deviceId == peerDeviceId) {
+    *canAccessPeer = 0;
+    return hipSuccess;
+  }
+  /* Cannot exceed the max number of devices */
+  if (static_cast<size_t>(deviceId) >= g_devices.size()
+      || static_cast<size_t>(peerDeviceId) >= g_devices.size()) {
+    return hipErrorInvalidDevice;
+  }
+  device = g_devices[deviceId]->devices()[0];
+  peer_device = g_devices[peerDeviceId]->devices()[0];
+  *canAccessPeer = static_cast<int>(std::find(device->p2pDevices_.begin(),
+                                    device->p2pDevices_.end(), as_cl(peer_device))
+                                    != device->p2pDevices_.end());
+  return hipSuccess;
+}
+
+hipError_t findLinkInfo(int device1, int device2,
+                        std::vector<amd::Device::LinkAttrType>* link_attrs) {
+
+  amd::Device* amd_dev_obj1 = nullptr;
+  amd::Device* amd_dev_obj2 = nullptr;
+  const int numDevices = static_cast<int>(g_devices.size());
+
+  if ((device1 < 0) || (device1 >= numDevices) || (device2 < 0) || (device2 >= numDevices)) {
+    return hipErrorInvalidDevice;
+  }
+
+  amd_dev_obj1 = g_devices[device1]->devices()[0];
+  amd_dev_obj2 = g_devices[device2]->devices()[0];
+
+  if (!amd_dev_obj1->findLinkInfo(*amd_dev_obj2, link_attrs)) {
+    return hipErrorInvalidHandle;
+  }
+
+  return hipSuccess;
+}
+
+hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2,
+                                        uint32_t* linktype, uint32_t* hopcount) {
+  HIP_INIT_API(hipExtGetLinkTypeAndHopCount, device1, device2, linktype, hopcount);
+
+  if (linktype == nullptr || hopcount == nullptr ||
+      device1 == device2 || device1 < 0 || device2 < 0) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  // Fill out the list of LinkAttributes
+  std::vector<amd::Device::LinkAttrType> link_attrs;
+  link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkLinkType, 0));
+  link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkHopCount, 0));
+
+  HIP_RETURN_ONFAIL(findLinkInfo(device1, device2, &link_attrs));
+
+  *linktype = static_cast<uint32_t>(link_attrs[0].second);
+  *hopcount = static_cast<uint32_t>(link_attrs[1].second);
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
+                                    int srcDevice, int dstDevice) {
+  HIP_INIT_API(hipDeviceGetP2PAttribute, value, attr, srcDevice, dstDevice);
+
+  if (value == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  if (srcDevice == dstDevice || srcDevice >= static_cast<int>(g_devices.size())
+      || dstDevice >= static_cast<int>(g_devices.size())) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  std::vector<amd::Device::LinkAttrType> link_attrs;
+
+  switch (attr) {
+    case hipDevP2PAttrPerformanceRank : {
+      link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkLinkType, 0));
+      break;
+    }
+    case hipDevP2PAttrAccessSupported : {
+      HIP_RETURN_ONFAIL(canAccessPeer(value, srcDevice, dstDevice));
+      break;
+    }
+    case hipDevP2PAttrNativeAtomicSupported : {
+      link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkLinkType, 0));
+      break;
+    }
+    case hipDevP2PAttrHipArrayAccessSupported : {
+      hipDeviceProp_t srcDeviceProp;
+      hipDeviceProp_t dstDeviceProp;
+      HIP_RETURN_ONFAIL(hipGetDeviceProperties(&srcDeviceProp, srcDevice));
+      HIP_RETURN_ONFAIL(hipGetDeviceProperties(&dstDeviceProp, dstDevice));
+
+      // Linear layout access is supported if P2P is enabled
+      // Opaque Images are supported only on homogeneous systems
+      // Might have more conditions to check, in future.
+      if (srcDeviceProp.gcnArch == dstDeviceProp.gcnArch) {
+        HIP_RETURN_ONFAIL(canAccessPeer(value, srcDevice, dstDevice));
+      } else {
+        *value = 0;
+      }
+      break;
+    }
+    default : {
+      LogPrintfError("Invalid attribute attr: %d ", attr);
+      HIP_RETURN(hipErrorInvalidValue);
+      break;
+    }
+  }
+
+  if ((attr != hipDevP2PAttrAccessSupported) && (attr != hipDevP2PAttrHipArrayAccessSupported)) {
+    HIP_RETURN_ONFAIL(findLinkInfo(srcDevice, dstDevice, &link_attrs));
+    *value = static_cast<int>(link_attrs[0].second);
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipDeviceCanAccessPeer(int* canAccess, int deviceId, int peerDeviceId) {
+  HIP_INIT_API(hipDeviceCanAccessPeer, canAccess, deviceId, peerDeviceId);
+  HIP_RETURN(canAccessPeer(canAccess, deviceId, peerDeviceId));
+}
+
+hipError_t hipDeviceDisablePeerAccess(int peerDeviceId) {
+  HIP_INIT_API(hipDeviceDisablePeerAccess, peerDeviceId);
+  int deviceId = hip::getCurrentDevice()->deviceId();
+  int canAccess = 0;
+  if ((hipSuccess != canAccessPeer(&canAccess, deviceId, peerDeviceId)) || (canAccess == 0)) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  amd::Device* device = g_devices[deviceId]->devices()[0];
+  amd::Device* peer_device = g_devices[peerDeviceId]->devices()[0];
+  peer_device->disableP2P(device);
+
+  HIP_RETURN(hip::getCurrentDevice()->DisablePeerAccess(peerDeviceId));
+}
+
+hipError_t hipDeviceEnablePeerAccess(int peerDeviceId, unsigned int flags) {
+  HIP_INIT_API(hipDeviceEnablePeerAccess, peerDeviceId, flags);
+  int deviceId = hip::getCurrentDevice()->deviceId();
+  int canAccess = 0;
+  if (flags != 0) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  if ((hipSuccess != canAccessPeer(&canAccess, deviceId, peerDeviceId)) || (canAccess == 0)) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  amd::Device* device = g_devices[deviceId]->asContext()->devices()[0];
+  amd::Device* peer_device = g_devices[peerDeviceId]->asContext()->devices()[0];
+  peer_device->enableP2P(device);
+
+  HIP_RETURN(hip::getCurrentDevice()->EnablePeerAccess(peerDeviceId));
+}
+
+hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevice,
+                         size_t sizeBytes) {
+  HIP_INIT_API(hipMemcpyPeer, dst, dstDevice, src, srcDevice, sizeBytes);
+
+  if (srcDevice >= static_cast<int>(g_devices.size()) ||
+      dstDevice >= static_cast<int>(g_devices.size()) ||
+      srcDevice < 0 || dstDevice < 0) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  HIP_RETURN(hipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice));
+}
+
+hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice,
+                              size_t sizeBytes, hipStream_t stream) {
+  HIP_INIT_API(hipMemcpyPeerAsync, dst, dstDevice, src, srcDevice, sizeBytes, stream);
+
+  if (srcDevice >= static_cast<int>(g_devices.size()) ||
+      dstDevice >=
static_cast<int>(g_devices.size()) ||
+      srcDevice < 0 || dstDevice < 0) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  HIP_RETURN(hipMemcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToDevice, stream));
+}
+
+hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags) {
+  HIP_INIT_API(hipCtxEnablePeerAccess, peerCtx, flags);
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx) {
+  HIP_INIT_API(hipCtxDisablePeerAccess, peerCtx);
+
+  HIP_RETURN(hipSuccess);
+}
diff --git a/rocclr/hip_platform.cpp b/rocclr/hip_platform.cpp
new file mode 100755
index 0000000000..f99f4a30e5
--- /dev/null
+++ b/rocclr/hip_platform.cpp
@@ -0,0 +1,942 @@
+/* Copyright (c) 2015-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include
+#include
+#include "hip_platform.hpp"
+#include "hip_internal.hpp"
+#include "platform/program.hpp"
+#include "platform/runtime.hpp"
+
+#include
+
+constexpr unsigned __hipFatMAGIC2 = 0x48495046; // "HIPF"
+
+thread_local std::stack<ihipExec_t> execStack_;
+PlatformState* PlatformState::platform_; // Initialized as nullptr by default
+
+// forward declaration of methods required for __hipRegisterManagedVar
+hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0);
+hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
+                      amd::HostQueue& queue, bool isAsync = false);
+
+struct __CudaFatBinaryWrapper {
+  unsigned int magic;
+  unsigned int version;
+  void* binary;
+  void* dummy1;
+};
+
+hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
+                              hipModule_t hmod, const char* name);
+
+hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj,
+                                  hipDeviceptr_t* dptr, size_t* bytes);
+
+extern hipError_t ihipModuleLaunchKernel(hipFunction_t f,
+                    uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
+                    uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
+                    uint32_t sharedMemBytes, hipStream_t hStream,
+                    void **kernelParams, void **extra,
+                    hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
+                    uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
+                    uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
+static bool isCompatibleCodeObject(const std::string& codeobj_target_id,
+                                   const char* device_name) {
+  // Workaround for device name mismatch.
+  // Device name may contain feature strings delimited by '+', e.g.
+  // gfx900+xnack. Currently HIP-Clang does not include feature strings
+  // in code object target id in fat binary. Therefore drop the feature
+  // strings from device name before comparing it with code object target id.
+  std::string short_name(device_name);
+  auto feature_loc = short_name.find('+');
+  if (feature_loc != std::string::npos) {
+    short_name.erase(feature_loc);
+  }
+  return codeobj_target_id == short_name;
+}
+
+extern "C" hip::FatBinaryInfo** __hipRegisterFatBinary(const void* data)
+{
+  const __CudaFatBinaryWrapper* fbwrapper = reinterpret_cast<const __CudaFatBinaryWrapper*>(data);
+  if (fbwrapper->magic != __hipFatMAGIC2 || fbwrapper->version != 1) {
+    LogPrintfError("Cannot Register fat binary. FatMagic: %u version: %u ", fbwrapper->magic,
+                   fbwrapper->version);
+    return nullptr;
+  }
+  return PlatformState::instance().addFatBinary(fbwrapper->binary);
+}
+
+extern "C" void __hipRegisterFunction(
+  hip::FatBinaryInfo** modules,
+  const void* hostFunction,
+  char* deviceFunction,
+  const char* deviceName,
+  unsigned int threadLimit,
+  uint3* tid,
+  uint3* bid,
+  dim3* blockDim,
+  dim3* gridDim,
+  int* wSize) {
+  static int enable_deferred_loading { []() {
+    char *var = getenv("HIP_ENABLE_DEFERRED_LOADING");
+    return var ? atoi(var) : 1;
+  }() };
+
+  hip::Function* func = new hip::Function(std::string(deviceName), modules);
+  PlatformState::instance().registerStatFunction(hostFunction, func);
+
+  if (!enable_deferred_loading) {
+    HIP_INIT();
+    hipFunction_t hfunc = nullptr;
+    hipError_t hip_error = hipSuccess;
+    for (size_t dev_idx = 0; dev_idx < g_devices.size(); ++dev_idx) {
+      hip_error = PlatformState::instance().getStatFunc(&hfunc, hostFunction, dev_idx);
+      guarantee((hip_error == hipSuccess), "Cannot Retrieve Static function");
+    }
+  }
+}
+
+// Registers a device-side global variable.
+// For each global variable in device code, there is a corresponding shadow
+// global variable in host code. The shadow host variable is used to keep
+// track of the value of the device side global variable between kernel
+// executions.
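+// Illustrative (hypothetical user code, not part of this change): for a device
+// global such as
+//   __device__ int g_flag;
+// the compiler emits a host-side shadow and a matching __hipRegisterVar() call,
+// which is what later lets
+//   hipMemcpyToSymbol(HIP_SYMBOL(g_flag), &host_val, sizeof(host_val));
+// resolve the shadow to the device-side allocation recorded here.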
+extern "C" void __hipRegisterVar(
+  hip::FatBinaryInfo** modules,  // The device modules containing code object
+  void* var,        // The shadow variable in host code
+  char* hostVar,    // Variable name in host code
+  char* deviceVar,  // Variable name in device code
+  int ext,          // Whether this variable is external
+  size_t size,      // Size of the variable
+  int constant,     // Whether this variable is constant
+  int global)       // Unknown, always 0
+{
+  hip::Var* var_ptr = new hip::Var(std::string(hostVar), hip::Var::DeviceVarKind::DVK_Variable, size, 0, 0, modules);
+  PlatformState::instance().registerStatGlobalVar(var, var_ptr);
+}
+
+extern "C" void __hipRegisterSurface(hip::FatBinaryInfo** modules,  // The device modules containing code object
+                                     void* var,        // The shadow variable in host code
+                                     char* hostVar,    // Variable name in host code
+                                     char* deviceVar,  // Variable name in device code
+                                     int type, int ext) {
+  hip::Var* var_ptr = new hip::Var(std::string(hostVar), hip::Var::DeviceVarKind::DVK_Surface, sizeof(surfaceReference), 0, 0, modules);
+  PlatformState::instance().registerStatGlobalVar(var, var_ptr);
+}
+
+extern "C" void __hipRegisterManagedVar(void *hipModule,  // Pointer to hip module returned from __hipRegisterFatBinary
+                                        void **pointer,   // Pointer to a chunk of managed memory with size \p size and alignment \p align
+                                                          // HIP runtime allocates such managed memory and assigns it to \p pointer
+                                        void *init_value, // Initial value to be copied into \p pointer
+                                        const char *name, // Name of the variable in code object
+                                        size_t size,
+                                        unsigned align) {
+  HIP_INIT();
+  hipError_t status = ihipMallocManaged(pointer, size, align);
+  if (status == hipSuccess) {
+    amd::HostQueue* queue = hip::getNullStream();
+    if (queue != nullptr) {
+      ihipMemcpy(*pointer, init_value, size, hipMemcpyHostToDevice, *queue);
+    } else {
+      ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL");
+    }
+  } else {
+    guarantee(false, "Error during allocation of managed memory!");
+  }
+  hip::Var* var_ptr = new hip::Var(std::string(name), hip::Var::DeviceVarKind::DVK_Managed, pointer,
+                                   size, align, reinterpret_cast<hip::FatBinaryInfo**>(hipModule));
+  PlatformState::instance().registerStatManagedVar(var_ptr);
+}
+
+extern "C" void __hipRegisterTexture(hip::FatBinaryInfo** modules,  // The device modules containing code object
+                                     void* var,        // The shadow variable in host code
+                                     char* hostVar,    // Variable name in host code
+                                     char* deviceVar,  // Variable name in device code
+                                     int type, int norm, int ext) {
+  hip::Var* var_ptr = new hip::Var(std::string(hostVar), hip::Var::DeviceVarKind::DVK_Texture, sizeof(textureReference), 0, 0, modules);
+  PlatformState::instance().registerStatGlobalVar(var, var_ptr);
+}
+
+extern "C" void __hipUnregisterFatBinary(hip::FatBinaryInfo** modules)
+{
+  PlatformState::instance().removeFatBinary(modules);
+}
+
+extern "C" hipError_t hipConfigureCall(
+  dim3 gridDim,
+  dim3 blockDim,
+  size_t sharedMem,
+  hipStream_t stream)
+{
+  HIP_INIT_API(hipConfigureCall, gridDim, blockDim, sharedMem, stream);
+
+  PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream);
+
+  HIP_RETURN(hipSuccess);
+}
+
+extern "C" hipError_t __hipPushCallConfiguration(
+  dim3 gridDim,
+  dim3 blockDim,
+  size_t sharedMem,
+  hipStream_t stream)
+{
+  HIP_INIT_API(__hipPushCallConfiguration, gridDim, blockDim, sharedMem, stream);
+
+  PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream);
+
+  HIP_RETURN(hipSuccess);
+}
+
+extern "C" hipError_t __hipPopCallConfiguration(dim3 *gridDim,
+                                                dim3 *blockDim,
+                                                size_t *sharedMem,
+                                                hipStream_t *stream) {
+  HIP_INIT_API(__hipPopCallConfiguration, gridDim, blockDim, sharedMem, stream);
+
+  ihipExec_t exec;
+  PlatformState::instance().popExec(exec);
+  *gridDim = exec.gridDim_;
+  *blockDim = exec.blockDim_;
+  *sharedMem = exec.sharedMem_;
+  *stream = exec.hStream_;
+
+  HIP_RETURN(hipSuccess);
+}
+
+extern "C" hipError_t hipSetupArgument(
+  const void *arg,
+  size_t size,
+  size_t offset)
+{
+  HIP_INIT_API(hipSetupArgument, arg, size, offset);
+
+  PlatformState::instance().setupArgument(arg, size, offset);
+
+  HIP_RETURN(hipSuccess);
+}
+
+extern "C" hipError_t hipLaunchByPtr(const void *hostFunction)
+{
+  HIP_INIT_API(hipLaunchByPtr, hostFunction);
+
+  ihipExec_t exec;
+  PlatformState::instance().popExec(exec);
+
+  hip::Stream* stream = reinterpret_cast<hip::Stream*>(exec.hStream_);
+  int deviceId = (stream != nullptr)? stream->DeviceId() : ihipGetDevice();
+  if (deviceId == -1) {
+    LogPrintfError("Wrong DeviceId: %d \n", deviceId);
+    HIP_RETURN(hipErrorNoDevice);
+  }
+  hipFunction_t func = nullptr;
+  hipError_t hip_error = PlatformState::instance().getStatFunc(&func, hostFunction, deviceId);
+  if ((hip_error != hipSuccess) || (func == nullptr)) {
+    LogPrintfError("Could not retrieve hostFunction: 0x%x \n", hostFunction);
+    HIP_RETURN(hipErrorInvalidDeviceFunction);
+  }
+
+  size_t size = exec.arguments_.size();
+  void *extra[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &exec.arguments_[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+  HIP_RETURN(hipModuleLaunchKernel(func,
+                                   exec.gridDim_.x, exec.gridDim_.y, exec.gridDim_.z,
+                                   exec.blockDim_.x, exec.blockDim_.y, exec.blockDim_.z,
+                                   exec.sharedMem_, exec.hStream_, nullptr, extra));
+}
+
+hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol) {
+  HIP_INIT_API(hipGetSymbolAddress, devPtr, symbol);
+
+  hipError_t hip_error = hipSuccess;
+  size_t sym_size = 0;
+
+  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), devPtr, &sym_size));
+
+  HIP_RETURN(hipSuccess, *devPtr);
+}
+
+hipError_t hipGetSymbolSize(size_t* sizePtr, const void* symbol) {
+  HIP_INIT_API(hipGetSymbolSize, sizePtr, symbol);
+
+  hipDeviceptr_t device_ptr = nullptr;
+  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), &device_ptr, sizePtr));
+
+  HIP_RETURN(hipSuccess, *sizePtr);
+}
+
+hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj,
+                                  hipDeviceptr_t* dptr, size_t* bytes)
+{
+  HIP_INIT();
+
+  amd::Program* program = nullptr;
+  device::Program* dev_program = nullptr;
+
+  /* Get Device Program pointer*/
+  program = as_amd(reinterpret_cast<cl_program>(hmod));
+  dev_program = program->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);
+
+  if (dev_program == nullptr) {
+    LogPrintfError("Cannot get Device Function for module: 0x%x \n", hmod);
+    HIP_RETURN(hipErrorInvalidDeviceFunction);
+  }
+  /* Find the global Symbols */
+  if (!dev_program->createGlobalVarObj(amd_mem_obj, dptr, bytes, name)) {
+    LogPrintfError("Cannot create Global Var obj for symbol: %s \n", name);
+    HIP_RETURN(hipErrorInvalidSymbol);
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+
+namespace hip_impl {
+hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+    int* maxBlocksPerCU, int* numBlocksPerGrid, int* bestBlockSize,
+    const amd::Device& device, hipFunction_t func, int inputBlockSize,
+    size_t dynamicSMemSize, bool bCalcPotentialBlkSz)
+{
+  hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
+  const amd::Kernel& kernel = *function->kernel();
+
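+  // Worked example of the occupancy math below (illustrative numbers, not from
+  // this change): on a gfx9 device with 4 SIMDs per CU, MaxWavesPerSimd is 8 and
+  // maxVGPRs is 256 with a granularity of 4, so a kernel using 64 VGPRs yields
+  //   VgprWaves = 256 / alignUp(64, 4) = 4 waves per SIMD,
+  //   alu_occupancy = 4 SIMDs * min(8, 4) = 16 waves per CU,
+  //   alu_limited_threads = 16 * 64 (wavefront size) = 1024 threads per CU.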
+ const device::Kernel::WorkGroupInfo* wrkGrpInfo = kernel.getDeviceKernel(device)->workGroupInfo(); + if (bCalcPotentialBlkSz == false) { + if (inputBlockSize <= 0) { + return hipErrorInvalidValue; + } + *bestBlockSize = 0; + // Make sure the requested block size is smaller than max supported + if (inputBlockSize > int(device.info().maxWorkGroupSize_)) { + *maxBlocksPerCU = 0; + *numBlocksPerGrid = 0; + return hipSuccess; + } + } + else { + if (inputBlockSize > int(device.info().maxWorkGroupSize_) || + inputBlockSize <= 0) { + // The user wrote the kernel to work with a workgroup size + // bigger than this hardware can support. Or they do not care + // about the size So just assume its maximum size is + // constrained by hardware + inputBlockSize = device.info().maxWorkGroupSize_; + } + } + // Find wave occupancy per CU => simd_per_cu * GPR usage + size_t MaxWavesPerSimd; + + if (device.isa().versionMajor() <= 9) { + MaxWavesPerSimd = 8; // Limited by SPI 32 per CU, hence 8 per SIMD + } else { + MaxWavesPerSimd = 16; + } + size_t VgprWaves = MaxWavesPerSimd; + size_t maxVGPRs; + uint32_t VgprGranularity; + if (device.isa().versionMajor() <= 9) { + if (device.isa().versionMajor() == 9 && + device.isa().versionMinor() == 0 && + device.isa().versionStepping() == 10) { + maxVGPRs = 512; + VgprGranularity = 8; + } + else { + maxVGPRs = 256; + VgprGranularity = 4; + } + } + else { + maxVGPRs = 1024; + VgprGranularity = 8; + } + if (wrkGrpInfo->usedSGPRs_ > 0) { + VgprWaves = maxVGPRs / amd::alignUp(wrkGrpInfo->usedVGPRs_, VgprGranularity); + } + + size_t GprWaves = VgprWaves; + if (wrkGrpInfo->usedSGPRs_ > 0) { + size_t maxSGPRs; + if (device.isa().versionMajor() < 8) { + maxSGPRs = 512; + } + else if (device.isa().versionMajor() < 10) { + maxSGPRs = 800; + } + else { + maxSGPRs = SIZE_MAX; // gfx10+ does not share SGPRs between waves + } + const size_t SgprWaves = maxSGPRs / amd::alignUp(wrkGrpInfo->usedSGPRs_, 16); + GprWaves = std::min(VgprWaves, SgprWaves); + } + + const size_t alu_occupancy = device.info().simdPerCU_ * std::min(MaxWavesPerSimd, GprWaves); + const int alu_limited_threads = alu_occupancy * wrkGrpInfo->wavefrontSize_; + + int lds_occupancy_wgs = INT_MAX; + const size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize; + if (total_used_lds != 0) { + lds_occupancy_wgs = static_cast(device.info().localMemSize_ / total_used_lds); + } + // Calculate how many blocks of inputBlockSize we can fit per CU + // Need to align with hardware wavefront size. If they want 65 threads, but + // waves are 64, then we need 128 threads per block. + // So this calculates how many blocks we can fit. + *maxBlocksPerCU = alu_limited_threads / amd::alignUp(inputBlockSize, wrkGrpInfo->wavefrontSize_); + // Unless those blocks are further constrained by LDS size. + *maxBlocksPerCU = std::min(*maxBlocksPerCU, lds_occupancy_wgs); + + // Some callers of this function want to return the block size, in threads, that + // leads to the maximum occupancy. In that case, inputBlockSize is the maximum + // workgroup size the user wants to allow, or that the hardware can allow. + // It is either the number of threads that we are limited to due to occupancy, or + // the maximum available block size for this kernel, which could have come from the + // user. e.g., if the user indicates the maximum block size is 64 threads, but we + // calculate that 128 threads can fit in each CU, we have to give up and return 64. 
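+  // Continuing the illustrative gfx9 numbers: with alu_limited_threads = 1024 and
+  // inputBlockSize = 64 (one wave64), bestBlockSize = min(1024, 64) = 64 and
+  // bestBlocksPerCU = 1024 / 64 = 16; numBlocksPerGrid then scales that by the CU
+  // count, still capped by the LDS-limited workgroup count.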
+  *bestBlockSize = std::min(alu_limited_threads, amd::alignUp(inputBlockSize, wrkGrpInfo->wavefrontSize_));
+  // If the best block size is smaller than the block size used to fit the maximum,
+  // then we need to make the grid bigger for full occupancy.
+  const int bestBlocksPerCU = alu_limited_threads / (*bestBlockSize);
+  // Unless those blocks are further constrained by LDS size.
+  *numBlocksPerGrid = device.info().maxComputeUnits_ * std::min(bestBlocksPerCU, lds_occupancy_wgs);
+
+  return hipSuccess;
+}
+}  // namespace hip_impl
+
+extern "C" {
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+                                             const void* f, size_t dynSharedMemPerBlk,
+                                             int blockSizeLimit)
+{
+  HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, f, dynSharedMemPerBlk, blockSizeLimit);
+  if ((gridSize == nullptr) || (blockSize == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  hipFunction_t func = nullptr;
+  hipError_t hip_error = PlatformState::instance().getStatFunc(&func, f, ihipGetDevice());
+  if ((hip_error != hipSuccess) || (func == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
+  int max_blocks_per_grid = 0;
+  int num_blocks = 0;
+  int best_block_size = 0;
+  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSizeLimit, dynSharedMemPerBlk, true);
+  if (ret == hipSuccess) {
+    *blockSize = best_block_size;
+    *gridSize = max_blocks_per_grid;
+  }
+  HIP_RETURN(ret);
+}
+
+hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+                                                   hipFunction_t f, size_t dynSharedMemPerBlk,
+                                                   int blockSizeLimit)
+{
+  HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSize, f, dynSharedMemPerBlk, blockSizeLimit);
+  if ((gridSize == nullptr) || (blockSize == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
+  int max_blocks_per_grid = 0;
+  int num_blocks = 0;
+  int best_block_size = 0;
+  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit, dynSharedMemPerBlk, true);
+  if (ret == hipSuccess) {
+    *blockSize = best_block_size;
+    *gridSize = max_blocks_per_grid;
+  }
+  HIP_RETURN(ret);
+}
+
+hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+                                                            hipFunction_t f, size_t dynSharedMemPerBlk,
+                                                            int blockSizeLimit, unsigned int flags)
+{
+  HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSizeWithFlags, f, dynSharedMemPerBlk, blockSizeLimit, flags);
+  if ((gridSize == nullptr) || (blockSize == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
+  int max_blocks_per_grid = 0;
+  int num_blocks = 0;
+  int best_block_size = 0;
+  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit, dynSharedMemPerBlk, true);
+  if (ret == hipSuccess) {
+    *blockSize = best_block_size;
+    *gridSize = max_blocks_per_grid;
+  }
+  HIP_RETURN(ret);
+}
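+// A usage sketch for the queries above (illustrative only, not part of this
+// patch; `vecAdd` and its arguments are assumed names):
+//
+//   int gridSize = 0, blockSize = 0;
+//   if (hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize,
+//           reinterpret_cast<const void*>(&vecAdd), 0, 0) == hipSuccess) {
+//     hipLaunchKernelGGL(vecAdd, dim3(gridSize), dim3(blockSize), 0, 0,
+//                        /* kernel args */ a, b, c, n);
+//   }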
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+                                                              hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
+{
+  HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, f, blockSize, dynSharedMemPerBlk);
+  if (numBlocks == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
+
+  int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
+  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk, false);
+  *numBlocks = num_blocks;
+  HIP_RETURN(ret);
+}
+
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
+                                                                       hipFunction_t f, int blockSize,
+                                                                       size_t dynSharedMemPerBlk, unsigned int flags)
+{
+  HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, f, blockSize, dynSharedMemPerBlk, flags);
+  if (numBlocks == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
+
+  int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
+  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk, false);
+  *numBlocks = num_blocks;
+  HIP_RETURN(ret);
+}
+
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+                                                        const void* f, int blockSize, size_t dynamicSMemSize)
+{
+  HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, f, blockSize, dynamicSMemSize);
+  if (numBlocks == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipFunction_t func = nullptr;
+  hipError_t hip_error = PlatformState::instance().getStatFunc(&func, f, ihipGetDevice());
+  if ((hip_error != hipSuccess) || (func == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
+
+  int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
+  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize, false);
+  *numBlocks = num_blocks;
+  HIP_RETURN(ret);
+}
+
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
+                                                                 const void* f,
+                                                                 int blockSize, size_t dynamicSMemSize, unsigned int flags)
+{
+  HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, f, blockSize, dynamicSMemSize, flags);
+  if (numBlocks == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipFunction_t func = nullptr;
+  hipError_t hip_error = PlatformState::instance().getStatFunc(&func, f, ihipGetDevice());
+  if ((hip_error != hipSuccess) || (func == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
+
+  int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
+  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize, false);
+  *numBlocks = num_blocks;
+  HIP_RETURN(ret);
+}
+}  // extern "C"
+
+
+#if defined(ATI_OS_LINUX)
+
+namespace hip_impl {
+
+void hipLaunchKernelGGLImpl(
+    uintptr_t function_address,
+    const dim3& numBlocks,
+    const dim3& dimBlocks,
+    uint32_t sharedMemBytes,
+    hipStream_t stream,
+    void** kernarg)
+{
+  HIP_INIT();
+
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  int deviceId = (s != nullptr) ? s->DeviceId() : ihipGetDevice();
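+  // Note: a null stream is legal here; it falls back to the device selected
+  // on the calling thread (ihipGetDevice()), which is where the default
+  // stream's kernels are dispatched.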
+  if (deviceId == -1) {
+    LogPrintfError("Wrong Device Id: %d \n", deviceId);
+  }
+
+  hipFunction_t func = nullptr;
+  hipError_t hip_error = PlatformState::instance().getStatFunc(&func, reinterpret_cast<const void*>(function_address), deviceId);
+  if ((hip_error != hipSuccess) || (func == nullptr)) {
+    LogPrintfError("Cannot find the static function: 0x%x", function_address);
+  }
+
+  hipModuleLaunchKernel(func,
+                        numBlocks.x, numBlocks.y, numBlocks.z,
+                        dimBlocks.x, dimBlocks.y, dimBlocks.z,
+                        sharedMemBytes, stream, nullptr, kernarg);
+}
+
+void hipLaunchCooperativeKernelGGLImpl(
+    uintptr_t function_address,
+    const dim3& numBlocks,
+    const dim3& dimBlocks,
+    uint32_t sharedMemBytes,
+    hipStream_t stream,
+    void** kernarg)
+{
+  HIP_INIT();
+
+  hipLaunchCooperativeKernel(reinterpret_cast<const void*>(function_address),
+                             numBlocks, dimBlocks, kernarg, sharedMemBytes, stream);
+}
+
+}  // namespace hip_impl
+
+#endif  // defined(ATI_OS_LINUX)
+
+hipError_t ihipLaunchKernel(const void* hostFunction,
+                            dim3 gridDim,
+                            dim3 blockDim,
+                            void** args,
+                            size_t sharedMemBytes,
+                            hipStream_t stream,
+                            hipEvent_t startEvent,
+                            hipEvent_t stopEvent,
+                            int flags)
+{
+  hipFunction_t func = nullptr;
+  int deviceId = hip::Stream::DeviceId(stream);
+  hipError_t hip_error = PlatformState::instance().getStatFunc(&func, hostFunction, deviceId);
+  if ((hip_error != hipSuccess) || (func == nullptr)) {
+    HIP_RETURN(hipErrorInvalidDeviceFunction);
+  }
+  size_t globalWorkSizeX = static_cast<size_t>(gridDim.x) * blockDim.x;
+  size_t globalWorkSizeY = static_cast<size_t>(gridDim.y) * blockDim.y;
+  size_t globalWorkSizeZ = static_cast<size_t>(gridDim.z) * blockDim.z;
+  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+    HIP_RETURN(hipErrorInvalidConfiguration);
+  }
+  HIP_RETURN(ihipModuleLaunchKernel(func, static_cast<uint32_t>(globalWorkSizeX),
+                                    static_cast<uint32_t>(globalWorkSizeY),
+                                    static_cast<uint32_t>(globalWorkSizeZ),
+                                    blockDim.x, blockDim.y, blockDim.z,
+                                    sharedMemBytes, stream, args, nullptr, startEvent, stopEvent,
+                                    flags));
+}
+
+// conversion routines between float and half precision
+
+static inline std::uint32_t f32_as_u32(float f) { union { float f; std::uint32_t u; } v; v.f = f; return v.u; }
+
+static inline float u32_as_f32(std::uint32_t u) { union { float f; std::uint32_t u; } v; v.u = u; return v.f; }
+
+static inline int clamp_int(int i, int l, int h) { return std::min(std::max(i, l), h); }
+
+
+// half to float; the f16 is in the low 16 bits of the input argument
+
+static inline float __convert_half_to_float(std::uint32_t a) noexcept {
+  std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
+  std::uint32_t v = f32_as_u32(u32_as_f32(u) * u32_as_f32(0x77800000U)/*0x1.0p+112f*/) + 0x38000000U;
+  u = (a & 0x7fff) != 0 ? v : u;
+  return u32_as_f32(u) * u32_as_f32(0x07800000U)/*0x1.0p-112f*/;
+}
+
+// float to half with round-to-nearest-even.
+// The lower 16 bits of the result are the bit pattern for the f16.
+static inline std::uint32_t __convert_float_to_half(float a) noexcept {
+  std::uint32_t u = f32_as_u32(a);
+  int e = static_cast<int>((u >> 23) & 0xff) - 127 + 15;
+  std::uint32_t m = ((u >> 11) & 0xffe) | ((u & 0xfff) != 0);
+  std::uint32_t i = 0x7c00 | (m != 0 ? 0x0200 : 0);
+  std::uint32_t n = ((std::uint32_t)e << 12) | m;
+  std::uint32_t s = (u >> 16) & 0x8000;
+  int b = clamp_int(1 - e, 0, 13);
+  std::uint32_t d = (0x1000 | m) >> b;
+  d |= (d << b) != (0x1000 | m);
+  std::uint32_t v = e < 1 ?
d : n; + v = (v >> 2) + (((v & 0x7) == 3) | ((v & 0x7) > 5)); + v = e > 30 ? 0x7c00 : v; + v = e == 143 ? i : v; + return s | v; +} + +extern "C" +#if !defined(_MSC_VER) +__attribute__((weak)) +#endif +float __gnu_h2f_ieee(unsigned short h){ + return __convert_half_to_float((std::uint32_t) h); +} + +extern "C" +#if !defined(_MSC_VER) +__attribute__((weak)) +#endif +unsigned short __gnu_f2h_ieee(float f){ + return (unsigned short)__convert_float_to_half(f); +} + +void PlatformState::init() +{ + amd::ScopedLock lock(lock_); + if(initialized_ || g_devices.empty()) { + return; + } + initialized_ = true; + for (auto& it : statCO_.modules_) { + digestFatBinary(it.first, it.second); + } + for (auto &it : statCO_.vars_) { + it.second->resize_dVar(g_devices.size()); + } + for (auto &it : statCO_.functions_) { + it.second->resize_dFunc(g_devices.size()); + } +} + +hipError_t PlatformState::loadModule(hipModule_t *module, const char* fname, const void* image) { + amd::ScopedLock lock(lock_); + + if(module == nullptr) { + return hipErrorInvalidValue; + } + + hip::DynCO* dynCo = new hip::DynCO(); + hipError_t hip_error = dynCo->loadCodeObject(fname, image); + if (hip_error != hipSuccess) { + delete dynCo; + return hip_error; + } + + *module = dynCo->module(); + assert(*module != nullptr); + + if (dynCO_map_.find(*module) != dynCO_map_.end()) { + return hipErrorAlreadyMapped; + } + dynCO_map_.insert(std::make_pair(*module, dynCo)); + + return hipSuccess; +} + +hipError_t PlatformState::unloadModule(hipModule_t hmod) { + amd::ScopedLock lock(lock_); + + auto it = dynCO_map_.find(hmod); + if (it == dynCO_map_.end()) { + return hipErrorNotFound; + } + + delete it->second; + dynCO_map_.erase(hmod); + + auto tex_it = texRef_map_.begin(); + while (tex_it != texRef_map_.end()) { + if (tex_it->second.first == hmod) { + tex_it = texRef_map_.erase(tex_it); + } else { + ++tex_it; + } + } + + return hipSuccess; +} + +hipError_t PlatformState::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod, + const char* func_name) { + amd::ScopedLock lock(lock_); + + auto it = dynCO_map_.find(hmod); + if (it == dynCO_map_.end()) { + LogPrintfError("Cannot find the module: 0x%x", hmod); + return hipErrorNotFound; + } + if (0 == strlen(func_name)) { + return hipErrorNotFound; + } + + return it->second->getDynFunc(hfunc, func_name); +} + +hipError_t PlatformState::getDynGlobalVar(const char* hostVar, hipModule_t hmod, + hipDeviceptr_t* dev_ptr, size_t* size_ptr) { + amd::ScopedLock lock(lock_); + + if(hostVar == nullptr || dev_ptr == nullptr || size_ptr == nullptr) { + return hipErrorInvalidValue; + } + + auto it = dynCO_map_.find(hmod); + if (it == dynCO_map_.end()) { + LogPrintfError("Cannot find the module: 0x%x", hmod); + return hipErrorNotFound; + } + + hip::DeviceVar* dvar = nullptr; + IHIP_RETURN_ONFAIL(it->second->getDeviceVar(&dvar, hostVar)); + *dev_ptr = dvar->device_ptr(); + *size_ptr = dvar->size(); + + return hipSuccess; +} + +hipError_t PlatformState::registerTexRef(textureReference* texRef, hipModule_t hmod, + std::string name) { + amd::ScopedLock lock(lock_); + texRef_map_.insert(std::make_pair(texRef, std::make_pair(hmod, name))); + return hipSuccess; +} + +hipError_t PlatformState::getDynTexGlobalVar(textureReference* texRef, hipDeviceptr_t* dev_ptr, + size_t* size_ptr) { + amd::ScopedLock lock(lock_); + + auto tex_it = texRef_map_.find(texRef); + if (tex_it == texRef_map_.end()) { + LogPrintfError("Cannot find the texRef Entry: 0x%x", texRef); + return hipErrorNotFound; + } + + auto it = 
dynCO_map_.find(tex_it->second.first);
+  if (it == dynCO_map_.end()) {
+    LogPrintfError("Cannot find the module: 0x%x", tex_it->second.first);
+    return hipErrorNotFound;
+  }
+
+  hip::DeviceVar* dvar = nullptr;
+  IHIP_RETURN_ONFAIL(it->second->getDeviceVar(&dvar, tex_it->second.second));
+  *dev_ptr = dvar->device_ptr();
+  *size_ptr = dvar->size();
+
+  return hipSuccess;
+}
+
+hipError_t PlatformState::getDynTexRef(const char* hostVar, hipModule_t hmod, textureReference** texRef) {
+  amd::ScopedLock lock(lock_);
+
+  auto it = dynCO_map_.find(hmod);
+  if (it == dynCO_map_.end()) {
+    LogPrintfError("Cannot find the module: 0x%x", hmod);
+    return hipErrorNotFound;
+  }
+
+  hip::DeviceVar* dvar = nullptr;
+  IHIP_RETURN_ONFAIL(it->second->getDeviceVar(&dvar, hostVar));
+
+  dvar->shadowVptr = new texture<char>();
+  *texRef = reinterpret_cast<textureReference*>(dvar->shadowVptr);
+  return hipSuccess;
+}
+
+hipError_t PlatformState::digestFatBinary(const void* data, hip::FatBinaryInfo*& programs) {
+  return statCO_.digestFatBinary(data, programs);
+}
+
+hip::FatBinaryInfo** PlatformState::addFatBinary(const void* data) {
+  return statCO_.addFatBinary(data, initialized_);
+}
+
+hipError_t PlatformState::removeFatBinary(hip::FatBinaryInfo** module) {
+  return statCO_.removeFatBinary(module);
+}
+
+hipError_t PlatformState::registerStatFunction(const void* hostFunction, hip::Function* func) {
+  return statCO_.registerStatFunction(hostFunction, func);
+}
+
+hipError_t PlatformState::registerStatGlobalVar(const void* hostVar, hip::Var* var) {
+  return statCO_.registerStatGlobalVar(hostVar, var);
+}
+
+hipError_t PlatformState::registerStatManagedVar(hip::Var* var) {
+  return statCO_.registerStatManagedVar(var);
+}
+
+hipError_t PlatformState::getStatFunc(hipFunction_t* hfunc, const void* hostFunction, int deviceId) {
+  return statCO_.getStatFunc(hfunc, hostFunction, deviceId);
+}
+
+hipError_t PlatformState::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId) {
+  if (func_attr == nullptr || hostFunction == nullptr) {
+    return hipErrorInvalidValue;
+  }
+  return statCO_.getStatFuncAttr(func_attr, hostFunction, deviceId);
+}
+
+hipError_t PlatformState::getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
+                                           size_t* size_ptr) {
+  return statCO_.getStatGlobalVar(hostVar, deviceId, dev_ptr, size_ptr);
+}
+
+hipError_t PlatformState::initStatManagedVarDevicePtr(int deviceId) {
+  return statCO_.initStatManagedVarDevicePtr(deviceId);
+}
+
+void PlatformState::setupArgument(const void *arg, size_t size, size_t offset) {
+  auto& arguments = execStack_.top().arguments_;
+
+  if (arguments.size() < offset + size) {
+    arguments.resize(offset + size);
+  }
+
+  ::memcpy(&arguments[offset], arg, size);
+}
+
+void PlatformState::configureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem,
+                                  hipStream_t stream) {
+  execStack_.push(ihipExec_t{gridDim, blockDim, sharedMem, stream});
+}
+
+void PlatformState::popExec(ihipExec_t& exec) {
+  exec = std::move(execStack_.top());
+  execStack_.pop();
+}
diff --git a/rocclr/hip_platform.hpp b/rocclr/hip_platform.hpp
new file mode 100755
index 0000000000..8ab3c1c09e
--- /dev/null
+++ b/rocclr/hip_platform.hpp
@@ -0,0 +1,97 @@
+/* Copyright (c) 2015-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+#pragma once
+
+#include "hip_internal.hpp"
+#include "hip_fatbin.hpp"
+#include "device/device.hpp"
+#include "hip_code_object.hpp"
+
+namespace hip_impl {
+
+hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+    int* maxBlocksPerCU, int* numBlocksPerGrid, int* bestBlockSize,
+    const amd::Device& device, hipFunction_t func, int blockSize,
+    size_t dynamicSMemSize, bool bCalcPotentialBlkSz);
+} /* namespace hip_impl */
+
+class PlatformState {
+  amd::Monitor lock_{"Guards PlatformState globals", true};
+
+  /* Singleton object */
+  static PlatformState* platform_;
+  PlatformState() {}
+  ~PlatformState() {}
+
+public:
+  void init();
+
+  //Dynamic Code Objects functions
+  hipError_t loadModule(hipModule_t* module, const char* fname, const void* image = nullptr);
+  hipError_t unloadModule(hipModule_t hmod);
+
+  hipError_t getDynFunc(hipFunction_t *hfunc, hipModule_t hmod, const char* func_name);
+  hipError_t getDynGlobalVar(const char* hostVar, hipModule_t hmod, hipDeviceptr_t* dev_ptr,
+                             size_t* size_ptr);
+  hipError_t getDynTexRef(const char* hostVar, hipModule_t hmod, textureReference** texRef);
+
+  hipError_t registerTexRef(textureReference* texRef, hipModule_t hmod, std::string name);
+  hipError_t getDynTexGlobalVar(textureReference* texRef, hipDeviceptr_t* dev_ptr,
+                                size_t* size_ptr);
+
+  /* Singleton instance */
+  static PlatformState& instance() {
+    if (platform_ == nullptr) {
+      // __hipRegisterFatBinary() will call this when app starts, thus
+      // there is no multiple entry issue here.
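+      // (If instance() could instead first be reached from concurrent
+      // threads, this unsynchronized lazy init would race on platform_.)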
+      platform_ = new PlatformState();
+    }
+    return *platform_;
+  }
+
+  //Static Code Objects functions
+  hip::FatBinaryInfo** addFatBinary(const void* data);
+  hipError_t removeFatBinary(hip::FatBinaryInfo** module);
+  hipError_t digestFatBinary(const void* data, hip::FatBinaryInfo*& programs);
+
+  hipError_t registerStatFunction(const void* hostFunction, hip::Function* func);
+  hipError_t registerStatGlobalVar(const void* hostVar, hip::Var* var);
+  hipError_t registerStatManagedVar(hip::Var* var);
+
+  hipError_t getStatFunc(hipFunction_t* hfunc, const void* hostFunction, int deviceId);
+  hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
+  hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
+                              size_t* size_ptr);
+
+  hipError_t initStatManagedVarDevicePtr(int deviceId);
+
+  //Exec Functions
+  void setupArgument(const void *arg, size_t size, size_t offset);
+  void configureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream);
+  void popExec(ihipExec_t& exec);
+
+private:
+  //Dynamic Code Object map, keyed by module, to get the corresponding object
+  std::unordered_map<hipModule_t, hip::DynCO*> dynCO_map_;
+  hip::StatCO statCO_;  //Static Code object var
+  bool initialized_{false};
+  std::unordered_map<textureReference*, std::pair<hipModule_t, std::string>> texRef_map_;
+};
diff --git a/rocclr/hip_prof_api.h b/rocclr/hip_prof_api.h
new file mode 100644
index 0000000000..8773f9ac1f
--- /dev/null
+++ b/rocclr/hip_prof_api.h
@@ -0,0 +1,270 @@
+/* Copyright (c) 2019-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef HIP_SRC_HIP_PROF_API_H
+#define HIP_SRC_HIP_PROF_API_H
+
+#include <atomic>
+#include <iostream>
+#include <mutex>
+
+#if USE_PROF_API
+#include "hip/amd_detail/hip_prof_str.h"
+#include "platform/prof_protocol.h"
+
+// HIP API callbacks spawner object macro
+#define HIP_CB_SPAWNER_OBJECT(CB_ID) \
+  api_callbacks_spawner_t<HIP_API_ID_##CB_ID> __api_tracer; \
+  { \
+    hip_api_data_t* api_data = __api_tracer.get_api_data_ptr(); \
+    if (api_data != NULL) { \
+      hip_api_data_t& api_data_ref = *api_data; \
+      INIT_CB_ARGS_DATA(CB_ID, api_data_ref); \
+      __api_tracer.call(); \
+    } \
+  }
+
+static const uint32_t HIP_DOMAIN_ID = ACTIVITY_DOMAIN_HIP_API;
+typedef activity_record_t hip_api_record_t;
+typedef activity_rtapi_callback_t hip_api_callback_t;
+typedef activity_sync_callback_t hip_act_callback_t;
+
+class api_callbacks_table_t {
+ public:
+  typedef std::mutex mutex_t;
+
+  typedef hip_api_record_t record_t;
+  typedef hip_api_callback_t fun_t;
+  typedef hip_act_callback_t act_t;
+
+  // HIP API callbacks table
+  struct hip_cb_table_entry_t {
+    volatile std::atomic<bool> sync;
+    volatile std::atomic<uint32_t> sem;
+    act_t act;
+    void* a_arg;
+    fun_t fun;
+    void* arg;
+  };
+
+  struct hip_cb_table_t {
+    hip_cb_table_entry_t arr[HIP_API_ID_NUMBER];
+  };
+
+  api_callbacks_table_t() {
+    memset(&callbacks_table_, 0, sizeof(callbacks_table_));
+  }
+
+  bool set_activity(uint32_t id, act_t fun, void* arg) {
+    std::lock_guard<mutex_t> lock(mutex_);
+    bool ret = true;
+
+    if (id < HIP_API_ID_NUMBER) {
+      cb_sync(id);
+      /*
+       'fun != nullptr' indicates an activity register call; the increment
+       should happen only once, but the client is free to call the register
+       CB multiple times for the same API id, hence the check.
+
+       'fun == nullptr' indicates a de-register call, and the decrement
+       should happen only once, hence the check.
+      */
+      if (fun != nullptr) {
+        if (callbacks_table_.arr[id].act == nullptr) {
+          enabled_api_count_++;
+        }
+      } else {
+        if (callbacks_table_.arr[id].act != nullptr) {
+          enabled_api_count_--;
+        }
+      }
+      amd::IS_PROFILER_ON = (enabled_api_count_ > 0);
+      callbacks_table_.arr[id].act = fun;
+      callbacks_table_.arr[id].a_arg = arg;
+      cb_release(id);
+    } else {
+      ret = false;
+    }
+
+    return ret;
+  }
+
+  bool set_callback(uint32_t id, fun_t fun, void* arg) {
+    std::lock_guard<mutex_t> lock(mutex_);
+    bool ret = true;
+
+    if (id < HIP_API_ID_NUMBER) {
+      cb_sync(id);
+      callbacks_table_.arr[id].fun = fun;
+      callbacks_table_.arr[id].arg = arg;
+      cb_release(id);
+    } else {
+      ret = false;
+    }
+
+    return ret;
+  }
+
+  void set_enabled(const bool& enabled) {
+    amd::IS_PROFILER_ON = enabled;
+  }
+
+  inline hip_cb_table_entry_t& entry(const uint32_t& id) {
+    return callbacks_table_.arr[id];
+  }
+
+  inline void sem_sync(const uint32_t& id) {
+    sem_increment(id);
+    if (entry(id).sync.load() == true) sync_wait(id);
+  }
+
+  inline void sem_release(const uint32_t& id) {
+    sem_decrement(id);
+  }
+
+  inline bool is_enabled() const {
+    return amd::IS_PROFILER_ON;
+  }
+
+ private:
+  inline void cb_sync(const uint32_t& id) {
+    entry(id).sync.store(true);
+    while (entry(id).sem.load() != 0) {}
+  }
+
+  inline void cb_release(const uint32_t& id) {
+    entry(id).sync.store(false);
+  }
+
+  inline void sem_increment(const uint32_t& id) {
+    const uint32_t prev = entry(id).sem.fetch_add(1);
+    if (prev == UINT32_MAX) {
+      std::cerr << "sem overflow id = " << id << std::endl << std::flush;
+      abort();
+    }
+  }
corrupted id = " << id << std::endl << std::flush; + abort(); + } + } + + void sync_wait(const uint32_t& id) { + sem_decrement(id); + while (entry(id).sync.load() == true) {} + sem_increment(id); + } + + mutex_t mutex_; + hip_cb_table_t callbacks_table_; + uint32_t enabled_api_count_; +}; + +extern api_callbacks_table_t callbacks_table; + +template +class api_callbacks_spawner_t { + public: + api_callbacks_spawner_t() : + api_data_(NULL) + { + if (!is_enabled()) return; + + if (cid_ >= HIP_API_ID_NUMBER) { + fprintf(stderr, "HIP %s bad id %d\n", __FUNCTION__, cid_); + abort(); + } + callbacks_table.sem_sync(cid_); + + hip_act_callback_t act = entry(cid_).act; + if (act != NULL) api_data_ = (hip_api_data_t*) act(cid_, NULL, NULL, NULL); + } + + void call() { + hip_api_callback_t fun = entry(cid_).fun; + void* arg = entry(cid_).arg; + if (fun != NULL) { + fun(HIP_DOMAIN_ID, cid_, api_data_, arg); + api_data_->phase = ACTIVITY_API_PHASE_EXIT; + } + } + + ~api_callbacks_spawner_t() { + if (!is_enabled()) return; + + if (api_data_ != NULL) { + hip_api_callback_t fun = entry(cid_).fun; + void* arg = entry(cid_).arg; + hip_act_callback_t act = entry(cid_).act; + void* a_arg = entry(cid_).a_arg; + if (fun != NULL) fun(HIP_DOMAIN_ID, cid_, api_data_, arg); + if (act != NULL) act(cid_, NULL, NULL, a_arg); + } + + callbacks_table.sem_release(cid_); + } + + hip_api_data_t* get_api_data_ptr() { + return api_data_; + } + + bool is_enabled() const { + return callbacks_table.is_enabled(); + } + + private: + inline api_callbacks_table_t::hip_cb_table_entry_t& entry(const uint32_t& id) { + return callbacks_table.entry(id); + } + + hip_api_data_t* api_data_; +}; + +template <> +class api_callbacks_spawner_t { + public: + api_callbacks_spawner_t() {} + void call() {} + hip_api_data_t* get_api_data_ptr() { return NULL; } + bool is_enabled() const { return false; } +}; + +#else + +#define HIP_CB_SPAWNER_OBJECT(x) do {} while(0) + +class api_callbacks_table_t { + public: + typedef void* act_t; + typedef void* fun_t; + bool set_activity(uint32_t id, act_t fun, void* arg) { return false; } + bool set_callback(uint32_t id, fun_t fun, void* arg) { return false; } +}; + +#endif + +#endif // HIP_SRC_HIP_PROF_API_H diff --git a/rocclr/hip_prof_gen.py b/rocclr/hip_prof_gen.py new file mode 100755 index 0000000000..aa44a70bd2 --- /dev/null +++ b/rocclr/hip_prof_gen.py @@ -0,0 +1,673 @@ +#!/usr/bin/python + +# Copyright (c) 2019-present Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import os, sys, re + +PROF_HEADER = "hip_prof_str.h" +OUTPUT = PROF_HEADER +REC_MAX_LEN = 1024 + +# Recursive sources processing +recursive_mode = 0 +# HIP_INIT_API macro patching +hip_patch_mode = 0 +# API matching types check +types_check_mode = 0 +# Private API check +private_check_mode = 0 + +# Messages and errors controll +verbose = 0 +errexit = 0 +inp_file = 'none' +line_num = -1 + +# Verbose message +def message(msg): + if verbose: sys.stdout.write(msg + '\n') + +# Fatal error termination +def error(msg): + if line_num != -1: + msg += ", file '" + inp_file + "', line (" + str(line_num) + ")" + if errexit: + msg = " Error: " + msg + else: + msg = " Warning: " + msg + + sys.stdout.write(msg + '\n') + sys.stderr.write(sys.argv[0] + msg +'\n') + +def fatal(msg): + error(msg) + sys.exit(1) + +############################################################# +# Normalizing API name +def filtr_api_name(name): + name = re.sub(r'\s*$', r'', name); + return name + +def filtr_api_decl(record): + record = re.sub("\s__dparm\([^\)]*\)", r'', record); + record = re.sub("\(void\*\)", r'', record); + return record + +# Normalizing API arguments +def filtr_api_args(args_str): + args_str = re.sub(r'^\s*', r'', args_str); + args_str = re.sub(r'\s*$', r'', args_str); + args_str = re.sub(r'\s*,\s*', r',', args_str); + args_str = re.sub(r'\s+', r' ', args_str); + args_str = re.sub(r'\s*(\*+)\s*', r'\1 ', args_str); + args_str = re.sub(r'(enum|struct) ', '', args_str); + return args_str + +# Normalizing types +def norm_api_types(type_str): + type_str = re.sub(r'uint32_t', r'unsigned int', type_str) + type_str = re.sub(r'^unsigned$', r'unsigned int', type_str) + return type_str + +# Creating a list of arguments [(type, name), ...] +def list_api_args(args_str): + args_str = filtr_api_args(args_str) + args_list = [] + if args_str != '': + for arg_pair in args_str.split(','): + if arg_pair == 'void': continue + arg_pair = re.sub(r'\s*=\s*\S+$','', arg_pair); + m = re.match("^(.*)\s(\S+)$", arg_pair); + if m: + arg_type = norm_api_types(m.group(1)) + arg_name = m.group(2) + args_list.append((arg_type, arg_name)) + else: + fatal("bad args: args_str: '" + args_str + "' arg_pair: '" + arg_pair + "'") + return args_list; + +# Creating arguments string "type0, type1, ..." +def filtr_api_types(args_str): + args_list = list_api_args(args_str) + types_str = '' + for arg_tuple in args_list: + types_str += arg_tuple[0] + ', ' + return types_str + +# Creating options list [opt0, opt1, ...] 
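+# e.g. filtr_api_types("const void* arg, size_t size") returns
+# "const void*, size_t, ", and filtr_api_opts(...) below returns ['arg', 'size'].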
+def filtr_api_opts(args_str):
+  args_list = list_api_args(args_str)
+  opts_list = []
+  for arg_tuple in args_list:
+    opts_list.append(arg_tuple[1])
+  return opts_list
+
+# Checking for pointer non-void arg type
+def pointer_ck(arg_type):
+  ptr_type = ''
+  m = re.match(r'(.*)\*$', arg_type)
+  if m:
+    ptr_type = m.group(1)
+    ptr_type = re.sub(r'const ', '', ptr_type)
+    if ptr_type == 'void': ptr_type = ''
+  return ptr_type
+#############################################################
+# Parsing API header
+# hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
+def parse_api(inp_file_p, out):
+  global inp_file
+  global line_num
+  inp_file = inp_file_p
+
+  beg_pattern = re.compile("^(hipError_t|const char\s*\*)\s+([^\(]+)\(")
+  api_pattern = re.compile("^(hipError_t|const char\s*\*)\s+([^\(]+)\(([^\)]*)\)")
+  end_pattern = re.compile("Texture")
+  hidden_pattern = re.compile(r'__attribute__\(\(visibility\("hidden"\)\)\)')
+  nms_open_pattern = re.compile(r'namespace hip_impl {')
+  nms_close_pattern = re.compile(r'}')
+
+  inp = open(inp_file, 'r')
+
+  found = 0
+  hidden = 0
+  nms_level = 0
+  record = ""
+  line_num = -1
+
+  for line in inp.readlines():
+    record += re.sub(r'^\s+', r' ', line[:-1])
+    line_num += 1
+
+    if len(record) > REC_MAX_LEN:
+      fatal("bad record \"" + record + "\"")
+
+    m = beg_pattern.match(line)
+    if m:
+      name = m.group(2)
+      if hidden != 0:
+        message("api: " + name + " - hidden")
+      elif nms_level != 0:
+        message("api: " + name + " - hip_impl")
+      else:
+        message("api: " + name)
+        found = 1
+
+    if found != 0:
+      record = re.sub("\s__dparm\([^\)]*\)", '', record)
+      m = api_pattern.match(record)
+      if m:
+        found = 0
+        if end_pattern.search(record): break
+        api_name = filtr_api_name(m.group(2))
+        api_args = m.group(3)
+        if not api_name in out:
+          out[api_name] = api_args
+      else: continue
+
+    hidden = 0
+    if hidden_pattern.match(line): hidden = 1
+
+    if nms_open_pattern.match(line): nms_level += 1
+    if (nms_level > 0) and nms_close_pattern.match(line): nms_level -= 1
+    if nms_level < 0:
+      fatal("nms level < 0")
+
+    record = ""
+
+  inp.close()
+  line_num = -1
+#############################################################
+# Parsing API implementation
+# hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset) {
+#   HIP_INIT_API(hipSetupArgument, arg, size, offset);
+# inp_file - input implementation source file
+# api_map - input public API map [<api name>] => <api args>
+# out - output map [<api name>] => [opt0, opt1, ...]
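+# For example, given the implementation shown in the header comment above,
+# parse_content() records hipSetupArgument with opts ['arg', 'size', 'offset']
+# after matching its HIP_INIT_API(hipSetupArgument, arg, size, offset) line.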
+def parse_content(inp_file_p, api_map, out): + global hip_patch_mode + global types_check_mode + global private_check_mode + global inp_file + global line_num + inp_file = inp_file_p + + # API method begin pattern + beg_pattern = re.compile("^(hipError_t|const char\s*\*)\s+[^\(]+\("); + # API declaration pattern + decl_pattern = re.compile("^(hipError_t|const char\s*\*)\s+([^\(]+)\(([^\)]*)\)\s*;"); + # API definition pattern + api_pattern = re.compile("^(hipError_t|const char\s*\*)\s+([^\(]+)\(([^\)]*)\)\s*{"); + # API init macro pattern + init_pattern = re.compile("(^\s*HIP_INIT_API\s*)\((([^,]+)(,.*|)|)(\);|,)\s*$"); + + # Open input file + inp = open(inp_file, 'r') + + # API name + api_name = "" + # Valid public API found flag + api_valid = 0 + # API overload (parameters mismatch) + api_overload = 0 + + # Input file patched content + content = '' + # Sub content for found API defiition + sub_content = '' + # Current record, accumulating several API definition related lines + record = '' + # Current input file line number + line_num = -1 + # API beginning found flag + found = 0 + + # Reading input file + for line in inp.readlines(): + # Accumulating record + record += re.sub(r'^\s+', r' ', line[:-1]) + line_num += 1 + + if len(record) > REC_MAX_LEN: + fatal("bad record \"" + record + "\"") + break; + + # Looking for API begin + if found == 0: + record = re.sub(r'\s*extern\s+"C"\s+', r'', record); + if beg_pattern.match(record): + found = 1 + record = filtr_api_decl(record) + + # Matching API declaration + if found == 1: + if decl_pattern.match(record): + found = 0 + + # Matching API definition + if found == 1: + m = api_pattern.match(record) + # Checking if complete API matched + if m: + found = 2 + api_valid = 0 + api_overload = 0 + api_name = filtr_api_name(m.group(2)) + # Checking if API name is in the API map + if (private_check_mode == 0) or (api_name in api_map): + if not api_name in api_map: api_map[api_name] = '' + # Getting API arguments + api_args = m.group(3) + # Getting etalon arguments from the API map + eta_args = api_map[api_name] + if eta_args == '': + eta_args = api_args + api_map[api_name] = eta_args + # Normalizing API arguments + api_types = filtr_api_types(api_args) + # Normalizing etalon arguments + eta_types = filtr_api_types(eta_args) + if (api_types == eta_types) or ((types_check_mode == 0) and (not api_name in out)): + # API is already found and not is mismatched + if (api_name in out): + fatal("API redefined \"" + api_name + "\", record \"" + record + "\"") + # Set valid public API found flag + api_valid = 1 + # Set output API map with API arguments list + out[api_name] = filtr_api_opts(api_args) + # Register missmatched API methods + else: + api_overload = 1 + # Warning about mismatched API, possible non public overloaded version + api_diff = '\t\t' + inp_file + " line(" + str(line_num) + ")\n\t\tapi: " + api_types + "\n\t\teta: " + eta_types + message("\t" + api_name + ' args mismatch:\n' + api_diff + '\n') + + # API found action + if found == 2: + if hip_patch_mode != 0: + # Looking for INIT macro + m = init_pattern.match(line) + if m: + init_name = api_name + if api_overload == 1: init_name = 'NONE' + init_args = m.group(4) + line = m.group(1) + '(' + init_name + init_args + m.group(5) + '\n' + + m = init_pattern.match(line) + if m: + found = 0 + if api_valid == 1: message("\t" + api_name) + # Ignore if it is initialized as NONE + init_name = m.group(3) + if init_name != 'NONE': + # Check if init name matching API name + if init_name != api_name: + 
fatal("init name mismatch: '" + init_name + "' <> '" + api_name + "'") + # Registering dummy API for non public API if the name in INIT is not NONE + if api_valid == 0: + # If init name is not in public API map then it is private API + # else it was not identified and will be checked on finish + if not init_name in api_map: + if init_name in out: + fatal("API reinit \"" + api_name + "\", record \"" + record + "\"") + out[init_name] = [] + elif re.search('}', line): + found = 0 + # Expect INIT macro for valid public API + # Removing and registering non-conformant APIs with missing HIP_INIT macro + if api_valid == 1: + if api_name in out: + del out[api_name] + del api_map[api_name] + # Registering non-conformant APIs + out['.' + api_name] = 1 + else: + fatal("API is not in out \"" + api_name + "\", record \"" + record + "\"") + + if found != 1: record = "" + content += line + + inp.close() + line_num = -1 + + if len(out) != 0: + return content + else: + return '' + +# src path walk +def parse_src(api_map, src_path, src_patt, out): + global recursive_mode + + pattern = re.compile(src_patt) + src_path = re.sub(r'\s', '', src_path) + for src_dir in src_path.split(':'): + message("Parsing " + src_dir + " for '" + src_patt + "'") + for root, dirs, files in os.walk(src_dir): + for fnm in files: + if pattern.search(fnm): + file = root + '/' + fnm + message(file) + content = parse_content(file, api_map, out); + if (hip_patch_mode != 0) and (content != ''): + f = open(file, 'w') + f.write(content) + f.close() + if recursive_mode == 0: break +############################################################# +# Generating profiling primitives header +# api_map - public API map [] => [(type, name), ...] +# opts_map - opts map [] => [opt0, opt1, ...] +def generate_prof_header(f, api_map, opts_map): + # Private API list + priv_lst = [] + + f.write('// automatically generated sources\n') + f.write('#ifndef _HIP_PROF_STR_H\n'); + f.write('#define _HIP_PROF_STR_H\n'); + f.write('#define HIP_PROF_VER 1\n') + + # Generating dummy macro for non-public API + f.write('\n// Dummy API primitives\n') + f.write('#define INIT_NONE_CB_ARGS_DATA(cb_data) {};\n') + for name in opts_map: + if not name in api_map: + opts_lst = opts_map[name] + if len(opts_lst) != 0: + fatal("bad dummy API \"" + name + "\", args: " + str(opts_lst)) + f.write('#define INIT_'+ name + '_CB_ARGS_DATA(cb_data) {};\n') + priv_lst.append(name) + + for name in priv_lst: + message("Private: " + name) + + # Generating the callbacks ID enumaration + f.write('\n// HIP API callbacks ID enumaration\n') + f.write('enum hip_api_id_t {\n') + cb_id = 0 + for name in api_map.keys(): + f.write(' HIP_API_ID_' + name + ' = ' + str(cb_id) + ',\n') + cb_id += 1 + f.write(' HIP_API_ID_NUMBER = ' + str(cb_id) + ',\n') + f.write('\n') + f.write(' HIP_API_ID_NONE = HIP_API_ID_NUMBER,\n') + for name in priv_lst: + f.write(' HIP_API_ID_' + name + ' = HIP_API_ID_NUMBER,\n') + f.write('};\n') + + # Generating the method to return API name by ID + f.write('\n// Return HIP API string by given ID\n') + f.write('static inline const char* hip_api_name(const uint32_t id) {\n') + f.write(' switch(id) {\n') + for name in api_map.keys(): + f.write(' case HIP_API_ID_' + name + ': return "' + name + '";\n') + f.write(' };\n') + f.write(' return "unknown";\n') + f.write('};\n') + + # Generating the method for querying API ID by name + f.write('\n') + f.write('#include \n'); + f.write('// Return HIP API ID by given name\n') + f.write('static inline uint32_t hipApiIdByName(const char* 
name) {\n') + for name, args in api_map.items(): + f.write(' if (strcmp("' + name + '", name) == 0) return HIP_API_ID_' + name + ';\n') + f.write(' return HIP_API_ID_NUMBER;\n') + f.write('}\n') + + # Generating the callbacks data structure + f.write('\n// HIP API callbacks data structure\n') + f.write( + 'typedef struct hip_api_data_s {\n' + + ' uint64_t correlation_id;\n' + + ' uint32_t phase;\n' + + ' union {\n' + ) + for name, args in api_map.items(): + if len(args) != 0: + f.write(' struct {\n') + for arg_tuple in args: + arg_type = arg_tuple[0] + ptr_type = pointer_ck(arg_type) + arg_name = arg_tuple[1] + # Checking for enum type + if arg_type == "hipLimit_t": arg_type = 'enum ' + arg_type + # Structuer field code + f.write(' ' + arg_type + ' ' + arg_name + ';\n') + if ptr_type != '': + f.write(' ' + ptr_type + ' ' + arg_name + '__val;\n') + f.write(' } ' + name + ';\n') + f.write( + ' } args;\n' + + '} hip_api_data_t;\n' + ) + + # Generating the callbacks args data filling macros + f.write('\n// HIP API callbacks args data filling macros\n') + for name, args in api_map.items(): + f.write('// ' + name + str(args) + '\n') + f.write('#define INIT_' + name + '_CB_ARGS_DATA(cb_data) { \\\n') + if name in opts_map: + opts_list = opts_map[name] + if len(args) != len(opts_list): + fatal("\"" + name + "\" API args and opts mismatch, args: " + str(args) + ", opts: " + str(opts_list)) + # API args iterating: + # type is args[][0] + # name is args[][1] + for ind in range(0, len(args)): + arg_tuple = args[ind] + arg_type = arg_tuple[0] + ptr_type = pointer_ck(arg_type) + fld_name = arg_tuple[1] + opt_name = opts_list[ind] + if arg_type == "const char*": + f.write(' cb_data.args.' + name + '.' + fld_name + ' = (' + opt_name + ') ? strdup(' + opt_name + ') : NULL; \\\n') + else: + f.write(' cb_data.args.' + name + '.' + fld_name + ' = (' + arg_type + ')' + opt_name + '; \\\n') + f.write('};\n') + f.write('#define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data)\n') + + f.write('#if HIP_PROF_HIP_API_STRING\n') + # Generating the method for the API args filling + f.write('\n') + f.write('// HIP API args filling method\n') + f.write('static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {\n') + f.write(' switch (id) {\n') + for name, args in api_map.items(): + f.write('// ' + name + str(args) + '\n') + f.write(' case HIP_API_ID_' + name + ':\n') + for ind in range(0, len(args)): + arg_tuple = args[ind] + arg_type = arg_tuple[0] + ptr_type = pointer_ck(arg_type) + fld_name = arg_tuple[1] + var_name = 'data->args.' + name + '.' + fld_name + if arg_type == "char*": + f.write(' ' + var_name + ' = (' + var_name + ') ? 
strdup(' + var_name + ') : NULL;\n') + else: + if ptr_type != '': + f.write(' if (' + var_name + ') ' + var_name + '__val = *(' + var_name + ');\n') + f.write(' break;\n') + f.write(' default: break;\n') + f.write(' };\n') + f.write('}\n') + + # Generating the method for the API string, name and parameters + f.write('\n') + f.write('#include \n'); + f.write('#include \n'); + f.write('// HIP API string method, method name and parameters\n') + f.write('static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {\n') + f.write(' std::ostringstream oss;\n') + f.write(' switch (id) {\n') + for name, args in api_map.items(): + f.write(' case HIP_API_ID_' + name + ':\n') + f.write(' oss << "' + name + '(";\n') + for ind in range(0, len(args)): + arg_tuple = args[ind] + arg_type = arg_tuple[0] + ptr_type = pointer_ck(arg_type) + arg_name = arg_tuple[1] + var_name = 'data->args.' + name + '.' + arg_name + delim = '' if ind == 0 else ', '; + oss_stream = 'oss << "' + delim + arg_name + '=' + line_shift = ' ' + f.write(line_shift) + if ptr_type != '': + f.write('if (' + var_name + ' == NULL) ' + oss_stream + 'NULL";\n' + line_shift + 'else ') + if pointer_ck(ptr_type) != '': + f.write(oss_stream + '" << (void*)' + var_name + '__val' + ';\n') + else: + f.write(oss_stream + '" << ' + var_name + '__val' + ';\n') + else: + f.write(oss_stream + '" << ' + var_name + ';\n') + f.write(' oss << ")";\n') + f.write(' break;\n') + f.write(' default: oss << "unknown";\n') + f.write(' };\n') + f.write(' return strdup(oss.str().c_str());\n') + f.write('}\n') + f.write('#endif // HIP_PROF_HIP_API_STRING\n') + + f.write('#endif // _HIP_PROF_STR_H\n'); + +############################################################# +# main +while len(sys.argv) > 1: + if not re.match(r'-', sys.argv[1]): break + + if (sys.argv[1] == '-v'): + verbose = 1 + sys.argv.pop(1) + + if (sys.argv[1] == '-r'): + recursive_mode = 1 + sys.argv.pop(1) + + if (sys.argv[1] == '-t'): + types_check_mode = 1 + sys.argv.pop(1) + + if (sys.argv[1] == '--priv'): + private_check_mode = 1 + sys.argv.pop(1) + + if (sys.argv[1] == '-e'): + errexit = 1 + sys.argv.pop(1) + + if (sys.argv[1] == '-p'): + hip_patch_mode = 1 + sys.argv.pop(1) + +# Usage +if (len(sys.argv) < 3): + fatal ("Usage: " + sys.argv[0] + " [-v] []\n" + + " -v - verbose messages\n" + + " -r - process source directory recursively\n" + + " -t - API types matching check\n" + + " --priv - private API check\n" + + " -e - on error exit mode\n" + + " -p - HIP_INIT_API macro patching mode\n" + + "\n" + + " Example:\n" + + " $ " + sys.argv[0] + " -v -p -t --priv ./api/hip/include/hip/amd_detail/hip_runtime_api.h ./api/hip ./api/hip/include/hip/amd_detail/hip_prof_str.h"); + +# API header file given as an argument +src_pat = "\.cpp$" +api_hfile = sys.argv[1] +if not os.path.isfile(api_hfile): + fatal("input file '" + api_hfile + "' not found") + +# Srcs directory given as an argument +src_dir = sys.argv[2] +if not os.path.isdir(src_dir): + fatal("src directory " + src_dir + "' not found") + +if len(sys.argv) > 3: OUTPUT = sys.argv[3] + +# API declaration map +api_map = { + 'hipSetupArgument': '', + 'hipMalloc3DArray': '', + 'hipFuncGetAttribute': '', + 'hipMemset3DAsync': '', + 'hipKernelNameRef': '', + 'hipStreamGetPriority': '', + 'hipLaunchByPtr': '', + 'hipFreeHost': '', + 'hipGetErrorName': '', + 'hipMemcpy3DAsync': '', + 'hipMemcpyParam2DAsync': '', + 'hipArray3DCreate': '', + 'hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags': '', + 
'hipOccupancyMaxPotentialBlockSize': '', + 'hipMallocManaged': '', + 'hipOccupancyMaxActiveBlocksPerMultiprocessor': '', + 'hipGetErrorString': '', + 'hipMallocHost': '', + 'hipModuleLoadDataEx': '', + 'hipGetDeviceProperties': '', + 'hipConfigureCall': '', + 'hipHccModuleLaunchKernel': '', + 'hipExtModuleLaunchKernel': '', +} +# API options map +opts_map = {} + +# Parsing API header +parse_api(api_hfile, api_map) + +# Parsing sources +parse_src(api_map, src_dir, src_pat, opts_map) + +# Checking for non-conformant APIs with missing HIP_INIT macro +for name in list(opts_map.keys()): + m = re.match(r'\.(\S*)', name) + if m: + message("Init missing: " + m.group(1)) + del opts_map[name] + +# Converting api map to map of lists +# Checking for not found APIs +not_found = 0 +if len(opts_map) != 0: + for name in api_map.keys(): + args_str = api_map[name]; + api_map[name] = list_api_args(args_str) + if not name in opts_map: + error("implementation not found: " + name) + not_found += 1 +if not_found != 0: + error(str(not_found) + " API calls missing in interception layer") + +# The output subdirectory seems to exist or not depending on the +# version of cmake. +output_dir = os.path.dirname(OUTPUT) +if not os.path.exists(output_dir): + os.makedirs(output_dir) + +# Generating output header file +with open(OUTPUT, 'w') as f: + generate_prof_header(f, api_map, opts_map) + +# Successfull exit +sys.exit(0) diff --git a/rocclr/hip_profile.cpp b/rocclr/hip_profile.cpp new file mode 100644 index 0000000000..3422f428ea --- /dev/null +++ b/rocclr/hip_profile.cpp @@ -0,0 +1,40 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include + +#include "hip_internal.hpp" + +hipError_t hipProfilerStart() { + HIP_INIT_API(hipProfilerStart); + + assert(0 && "Unimplemented"); + + HIP_RETURN(hipErrorNotSupported); +} + + +hipError_t hipProfilerStop() { + HIP_INIT_API(hipProfilerStop); + + assert(0 && "Unimplemented"); + + HIP_RETURN(hipErrorNotSupported); +} diff --git a/rocclr/hip_rtc.cpp b/rocclr/hip_rtc.cpp new file mode 100755 index 0000000000..66b6ad6e43 --- /dev/null +++ b/rocclr/hip_rtc.cpp @@ -0,0 +1,419 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. 
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include <hip/hiprtc.h>
+#include "hiprtc_internal.hpp"
+#include <hip/hip_version.h>
+#include "platform/program.hpp"
+
+#ifdef __HIP_ENABLE_PCH
+extern const char __hip_pch[];
+extern unsigned __hip_pch_size;
+void __hipGetPCH(const char** pch, unsigned int *size) {
+  *pch = __hip_pch;
+  *size = __hip_pch_size;
+}
+#endif
+
+namespace hiprtc {
+thread_local hiprtcResult g_lastRtcError = HIPRTC_SUCCESS;
+}
+
+class ProgramState {
+  amd::Monitor lock_;
+private:
+  static ProgramState* programState_;
+
+  ProgramState() : lock_("Guards program state") {}
+  ~ProgramState() {}
+public:
+  std::unordered_map<hiprtcProgram,
+                     std::pair<std::vector<std::string>, std::vector<std::string>>> progHeaders_;
+
+  std::map<std::string, std::pair<std::string, std::string>> nameExpresssion_;
+
+  static ProgramState& instance();
+  uint32_t addNameExpression(const char* name_expression);
+  char* getLoweredName(const char* name_expression);
+};
+
+ProgramState* ProgramState::programState_ = nullptr;
+
+ProgramState& ProgramState::instance() {
+  if (programState_ == nullptr) {
+    programState_ = new ProgramState;
+  }
+  return *programState_;
+}
+
+uint32_t ProgramState::addNameExpression(const char* name_expression) {
+  amd::ScopedLock lock(lock_);
+
+  // Strip the name expression clean of any '(', ')' or '&'
+  std::string strippedName(name_expression);
+  if (strippedName.back() == ')') {
+    strippedName.pop_back();
+    strippedName.erase(0, strippedName.find('(') + 1);
+  }
+  if (strippedName.front() == '&') {
+    strippedName.erase(0, 1);
+  }
+  auto it = nameExpresssion_.find(name_expression);
+  if (it == nameExpresssion_.end()) {
+    nameExpresssion_.insert(std::pair<std::string, std::pair<std::string, std::string>>
+                            (name_expression, std::make_pair(strippedName, "")));
+  }
+  return nameExpresssion_.size();
+}
+
+char* demangle(const char* loweredName) {
+  if (!loweredName) {
+    return nullptr;
+  }
+#if __linux__
+  int status = 0;
+  char* demangledName = DEMANGLE(loweredName, nullptr, nullptr, &status);
+  if (status != 0) {
+    LogPrintfError("Cannot demangle loweredName: %s \n", loweredName);
+    return nullptr;
+  }
+#elif defined(_WIN32)
+  char* demangledName = (char*)malloc(UNDECORATED_SIZE);
+
+  if (!UnDecorateSymbolName(loweredName, demangledName,
+                            UNDECORATED_SIZE / sizeof(*demangledName), UNDNAME_COMPLETE))
+  {
+    free(demangledName);
+    LogPrintfError("Cannot undecorate loweredName: %s demangledName: %s \n",
+                   loweredName, demangledName);
+    return nullptr;
+  }
+#else
+#error "Only Linux and Windows are supported"
+#endif  // __linux__
+  return demangledName;
+}
+
+static std::string handleMangledName(std::string name) {
+  std::string loweredName;
+  char* demangled = demangle(name.c_str());
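+  // e.g. "_Z6vecAddPfS_S_" demangles to "vecAdd(float*, float*, float*)";
+  // the trimming below drops any "void " prefix, keeps template arguments
+  // through the matching '>', and cuts the parameter list at the first '(',
+  // leaving "vecAdd" for comparison against the stripped name expression.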
+  loweredName.assign(demangled == nullptr ? std::string() : demangled);
+  free(demangled);
+
+  if (loweredName.empty()) {
+    return name;
+  }
+
+  if (loweredName.find(".kd") != std::string::npos) {
+    return {};
+  }
+
+  if (loweredName.find("void ") == 0) {
+    loweredName.erase(0, strlen("void "));
+  }
+
+  auto dx{loweredName.find_first_of("(<")};
+
+  if (dx == std::string::npos) {
+    return loweredName;
+  }
+
+  if (loweredName[dx] == '<') {
+    uint32_t count = 1;
+    do {
+      ++dx;
+      count += (loweredName[dx] == '<') ? 1 : ((loweredName[dx] == '>') ? -1 : 0);
+    } while (count);
+
+    loweredName.erase(++dx);
+  } else {
+    loweredName.erase(dx);
+  }
+
+  return loweredName;
+}
+
+static std::string getValueOf(const std::string& option) {
+  std::string res;
+  auto f = std::find(option.begin(), option.end(), '=');
+  if (f != option.end()) res = std::string(f + 1, option.end());
+  return res;
+}
+
+static void transformOptions(std::vector<std::string>& options, amd::Program* program) {
+  std::vector<std::string> t_option;
+  for (auto& i : options) {
+#ifdef __HIP_ENABLE_PCH
+    // Use the precompiled header for hip
+    if (i == "-hip-pch") {
+      const char* pch = nullptr;
+      unsigned int pch_size = 0;
+      __hipGetPCH(&pch, &pch_size);
+      program->addPreCompiledHeader(std::string(pch, pch_size));
+      i = "-nogpuinc";
+      continue;
+    }
+#endif
+    // Some rtc samples use --gpu-architecture
+    if (i.rfind("--gpu-architecture=", 0) == 0) {
+      auto val = getValueOf(i);
+      i = "--offload-arch=" + val;
+      continue;
+    }
+  }
+}
+
+const char* hiprtcGetErrorString(hiprtcResult x) {
+  switch (x) {
+    case HIPRTC_SUCCESS:
+      return "HIPRTC_SUCCESS";
+    case HIPRTC_ERROR_OUT_OF_MEMORY:
+      return "HIPRTC_ERROR_OUT_OF_MEMORY";
+    case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE:
+      return "HIPRTC_ERROR_PROGRAM_CREATION_FAILURE";
+    case HIPRTC_ERROR_INVALID_INPUT:
+      return "HIPRTC_ERROR_INVALID_INPUT";
+    case HIPRTC_ERROR_INVALID_PROGRAM:
+      return "HIPRTC_ERROR_INVALID_PROGRAM";
+    case HIPRTC_ERROR_INVALID_OPTION:
+      return "HIPRTC_ERROR_INVALID_OPTION";
+    case HIPRTC_ERROR_COMPILATION:
+      return "HIPRTC_ERROR_COMPILATION";
+    case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE:
+      return "HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE";
+    case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
+      return "HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION";
+    case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
+      return "HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION";
+    case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
+      return "HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID";
+    case HIPRTC_ERROR_INTERNAL_ERROR:
+      return "HIPRTC_ERROR_INTERNAL_ERROR";
+    default:
+      LogPrintfError("Invalid HIPRTC error code: %d \n", x);
+      return nullptr;
+  };
+
+  ShouldNotReachHere();
+
+  return nullptr;
+}
+
+hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src, const char* name,
+                                 int numHeaders, const char** headers, const char** headerNames) {
+  HIPRTC_INIT_API(prog, src, name, numHeaders, headers, headerNames);
+
+  if (prog == nullptr) {
+    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_PROGRAM);
+  }
+  if (numHeaders < 0) {
+    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
+  }
+  if (numHeaders && (headers == nullptr || headerNames == nullptr)) {
+    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
+  }
+
+  amd::Program* program = new amd::Program(*hip::getCurrentDevice()->asContext(), src, amd::Program::HIP,
+                                           numHeaders, headers, headerNames);
+  if (program == nullptr) {
+    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
+  }
+
+  if (CL_SUCCESS != program->addDeviceProgram(*hip::getCurrentDevice()->devices()[0])) {
+    program->release();
+    HIPRTC_RETURN(HIPRTC_ERROR_PROGRAM_CREATION_FAILURE);
+  }
+
+  *prog = reinterpret_cast<hiprtcProgram>(as_cl(program));
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
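+// A usage sketch of the RTC entry points defined in this file (illustrative
+// only; the kernel source, name and architecture are assumed):
+//
+//   hiprtcProgram prog;
+//   hiprtcCreateProgram(&prog, "extern \"C\" __global__ void k() {}",
+//                       "k.cu", 0, nullptr, nullptr);
+//   const char* opts[] = {"--offload-arch=gfx906"};
+//   if (hiprtcCompileProgram(prog, 1, opts) == HIPRTC_SUCCESS) {
+//     size_t n = 0;
+//     hiprtcGetCodeSize(prog, &n);
+//     std::vector<char> code(n);
+//     hiprtcGetCode(prog, code.data());
+//   }
+//   hiprtcDestroyProgram(&prog);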
+    HIPRTC_RETURN(HIPRTC_ERROR_PROGRAM_CREATION_FAILURE);
+  }
+
+  *prog = reinterpret_cast<hiprtcProgram>(as_cl(program));
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
+
+hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions, const char** options) {
+  // FIXME[skudchad]: Add headers to amd::Program::build and device::Program::build and
+  // pass the headers saved in ProgramState to amd::Program::build.
+  HIPRTC_INIT_API(prog, numOptions, options);
+
+  amd::Program* program = as_amd(reinterpret_cast<cl_program>(prog));
+
+  std::ostringstream ostrstr;
+  std::vector<std::string> oarr(&options[0], &options[numOptions]);
+
+  const std::string hipVerOpt{"--hip-version=" + std::to_string(HIP_VERSION_MAJOR) + '.' +
+                              std::to_string(HIP_VERSION_MINOR) + '.' +
+                              std::to_string(HIP_VERSION_PATCH)};
+  const std::string hipVerMajor{"-DHIP_VERSION_MAJOR=" + std::to_string(HIP_VERSION_MAJOR)};
+  const std::string hipVerMinor{"-DHIP_VERSION_MINOR=" + std::to_string(HIP_VERSION_MINOR)};
+  const std::string hipVerPatch{"-DHIP_VERSION_PATCH=" + std::to_string(HIP_VERSION_PATCH)};
+
+  oarr.push_back(hipVerOpt);
+  oarr.push_back(hipVerMajor);
+  oarr.push_back(hipVerMinor);
+  oarr.push_back(hipVerPatch);
+
+  transformOptions(oarr, program);
+  std::copy(oarr.begin(), oarr.end(), std::ostream_iterator<std::string>(ostrstr, " "));
+
+  std::vector<amd::Device*> devices{hip::getCurrentDevice()->devices()[0]};
+  if (CL_SUCCESS != program->build(devices, ostrstr.str().c_str(), nullptr, nullptr)) {
+    HIPRTC_RETURN(HIPRTC_ERROR_COMPILATION);
+  }
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
+
+hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression) {
+  HIPRTC_INIT_API(prog, name_expression);
+
+  if (name_expression == nullptr) {
+    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
+  }
+  amd::Program* program = as_amd(reinterpret_cast<cl_program>(prog));
+
+  uint32_t id = ProgramState::instance().addNameExpression(name_expression);
+
+  const auto var{"__hiprtc_" + std::to_string(id)};
+  const auto code{"\nextern \"C\" constexpr auto " + var + " = " + name_expression + ';'};
+
+  program->appendToSource(code.c_str());
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
+
+hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog, const char* name_expression,
+                                  const char** loweredName) {
+  HIPRTC_INIT_API(prog, name_expression, loweredName);
+
+  if (name_expression == nullptr || loweredName == nullptr) {
+    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
+  }
+
+  amd::Program* program = as_amd(reinterpret_cast<cl_program>(prog));
+
+  device::Program* dev_program =
+      program->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);
+
+  auto it = ProgramState::instance().nameExpresssion_.find(name_expression);
+  if (it == ProgramState::instance().nameExpresssion_.end()) {
+    HIPRTC_RETURN(HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID);
+  }
+
+  std::string strippedName = it->second.first;
+  std::vector<std::string> mangledNames;
+
+  if (!dev_program->getLoweredNames(&mangledNames)) {
+    HIPRTC_RETURN(HIPRTC_ERROR_COMPILATION);
+  }
+
+  for (auto& name : mangledNames) {
+    std::string demangledName = handleMangledName(name);
+    if (demangledName == strippedName) {
+      it->second.second.assign(name);
+    }
+  }
+
+  *loweredName = it->second.second.c_str();
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
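+// Illustrative use of the name-expression flow above (a host-side sketch, not
+// part of the original change; "saxpy" and the template argument are made-up
+// examples):
+//   hiprtcProgram prog;
+//   hiprtcCreateProgram(&prog, src, "saxpy.cu", 0, nullptr, nullptr);
+//   hiprtcAddNameExpression(prog, "&saxpy<float>");           // before compiling
+//   hiprtcCompileProgram(prog, 0, nullptr);
+//   const char* lowered = nullptr;
+//   hiprtcGetLoweredName(prog, "&saxpy<float>", &lowered);    // mangled name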
+hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog) {
+  HIPRTC_INIT_API(prog);
+
+  if (prog == NULL) {
+    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
+  }
+
+  // Release the program. hiprtcProgram is a double pointer, so release *prog.
+  amd::Program* program = as_amd(reinterpret_cast<cl_program>(*prog));
+
+  program->release();
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
+
+hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* binaryMem) {
+  HIPRTC_INIT_API(prog, binaryMem);
+
+  amd::Program* program = as_amd(reinterpret_cast<cl_program>(prog));
+  const device::Program::binary_t& binary =
+      program->getDeviceProgram(*hip::getCurrentDevice()->devices()[0])->binary();
+
+  ::memcpy(binaryMem, binary.first, binary.second);
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
+
+hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* binarySizeRet) {
+  HIPRTC_INIT_API(prog, binarySizeRet);
+
+  amd::Program* program = as_amd(reinterpret_cast<cl_program>(prog));
+
+  *binarySizeRet =
+      program->getDeviceProgram(*hip::getCurrentDevice()->devices()[0])->binary().second;
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
+
+hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* dst) {
+  HIPRTC_INIT_API(prog, dst);
+
+  amd::Program* program = as_amd(reinterpret_cast<cl_program>(prog));
+  const device::Program* devProgram =
+      program->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);
+
+  auto log = program->programLog() + devProgram->buildLog().c_str();
+
+  log.copy(dst, log.size());
+  dst[log.size()] = '\0';
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
+
+hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet) {
+  HIPRTC_INIT_API(prog, logSizeRet);
+
+  amd::Program* program = as_amd(reinterpret_cast<cl_program>(prog));
+  const device::Program* devProgram =
+      program->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);
+
+  auto log = program->programLog() + devProgram->buildLog().c_str();
+
+  *logSizeRet = log.size() + 1;
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
+
+hiprtcResult hiprtcVersion(int* major, int* minor) {
+  HIPRTC_INIT_API(major, minor);
+
+  if (major == nullptr || minor == nullptr) {
+    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
+  }
+
+  *major = 9;
+  *minor = 0;
+
+  HIPRTC_RETURN(HIPRTC_SUCCESS);
+}
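+// Illustrative end-to-end use of the accessors above (a sketch, not part of
+// the original change): compile, fetch the code object, then load it through
+// the module API.
+//   size_t n = 0;
+//   hiprtcGetCodeSize(prog, &n);
+//   std::vector<char> code(n);
+//   hiprtcGetCode(prog, code.data());
+//   hiprtcDestroyProgram(&prog);
+//   hipModule_t module;
+//   hipFunction_t kernel;
+//   hipModuleLoadData(&module, code.data());
+//   hipModuleGetFunction(&kernel, module, lowered);  // name from hiprtcGetLoweredName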
diff --git a/rocclr/hip_stream.cpp b/rocclr/hip_stream.cpp
new file mode 100755
index 0000000000..c490fd0fe9
--- /dev/null
+++ b/rocclr/hip_stream.cpp
@@ -0,0 +1,587 @@
+/* Copyright (c) 2015-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include <hip/hip_runtime.h>
+#include "hip_internal.hpp"
+#include "hip_event.hpp"
+#include "thread/monitor.hpp"
+#include "hip_prof_api.h"
+
+extern api_callbacks_table_t callbacks_table;
+
+static amd::Monitor streamSetLock{"Guards global stream set"};
+static std::unordered_set<hip::Stream*> streamSet;
+
+namespace hip {
+
+// ================================================================================================
+Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream,
+               const std::vector<uint32_t>& cuMask, hipStreamCaptureStatus captureStatus)
+    : queue_(nullptr),
+      lock_("Stream Callback lock"),
+      device_(dev),
+      priority_(p),
+      flags_(f),
+      null_(null_stream),
+      cuMask_(cuMask),
+      captureStatus_(captureStatus) {}
+
+// ================================================================================================
+Stream::~Stream() {
+  if (queue_ != nullptr) {
+    amd::ScopedLock lock(streamSetLock);
+    streamSet.erase(this);
+
+    queue_->release();
+    queue_ = nullptr;
+  }
+}
+
+hipError_t Stream::EndCapture() {
+  for (auto event : captureEvents_) {
+    hip::Event* e = reinterpret_cast<hip::Event*>(event);
+    e->EndCapture();
+  }
+  for (auto stream : parallelCaptureStreams_) {
+    hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+    s->EndCapture();
+  }
+  captureStatus_ = hipStreamCaptureStatusNone;
+  pCaptureGraph_ = nullptr;
+  originStream_ = false;
+  parentStream_ = nullptr;
+  lastCapturedNodes_.clear();
+  parallelCaptureStreams_.clear();
+  captureEvents_.clear();
+
+  return hipSuccess;
+}
+// ================================================================================================
+bool Stream::Create() {
+  // Enable queue profiling if a profiler is attached, which sets the callbacks_table flag,
+  // or if profiling is forced with the env var. This enables time-stamp collection for every
+  // command submitted to the stream (queue).
+  bool isProfilerAttached = callbacks_table.is_enabled();
+  cl_command_queue_properties properties = (isProfilerAttached || HIP_FORCE_QUEUE_PROFILING) ?
+      CL_QUEUE_PROFILING_ENABLE : 0;
+  amd::CommandQueue::Priority p;
+  switch (priority_) {
+    case Priority::High:
+      p = amd::CommandQueue::Priority::High;
+      break;
+    case Priority::Low:
+      p = amd::CommandQueue::Priority::Low;
+      break;
+    case Priority::Normal:
+    default:
+      p = amd::CommandQueue::Priority::Normal;
+      break;
+  }
+  amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0],
+                                             properties, amd::CommandQueue::RealTimeDisabled,
+                                             p, cuMask_);
+
+  // Create a host queue
+  bool result = (queue != nullptr) ? queue->create() : false;
+  // Insert the just-created stream into the set of blocking queues
+  if (result) {
+    amd::ScopedLock lock(streamSetLock);
+    streamSet.insert(this);
+    queue_ = queue;
+    queue->vdev()->profilerAttach(isProfilerAttached);
+  } else if (queue != nullptr) {
+    queue->release();
+  }
+
+  return result;
+}
+
+// ================================================================================================
+amd::HostQueue* Stream::asHostQueue(bool skip_alloc) {
+  if (queue_ != nullptr) {
+    return queue_;
+  }
+  // Access to the stream object is lock protected, because of the possible allocation
+  amd::ScopedLock l(Lock());
+  if (queue_ == nullptr) {
+    // Create the host queue for the first time
+    if (!skip_alloc) {
+      Create();
+    }
+  }
+  return queue_;
+}
+
+// ================================================================================================
+void Stream::Finish() const {
+  if (queue_ != nullptr) {
+    queue_->finish();
+  }
+}
+
+// ================================================================================================
+int Stream::DeviceId() const {
+  return device_->deviceId();
+}
+
+int Stream::DeviceId(const hipStream_t hStream) {
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(hStream);
+  int deviceId = (s != nullptr) ? s->DeviceId() : ihipGetDevice();
+  assert(deviceId >= 0 && deviceId < static_cast<int>(g_devices.size()));
+  return deviceId;
+}
+
+void Stream::syncNonBlockingStreams() {
+  amd::ScopedLock lock(streamSetLock);
+  for (auto& it : streamSet) {
+    if (it->Flags() & hipStreamNonBlocking) {
+      it->asHostQueue()->finish();
+    }
+  }
+}
+
+// ================================================================================================
+bool isValid(hipStream_t stream) {
+  // The NULL stream is always valid
+  if (stream == nullptr) {
+    return true;
+  }
+
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  amd::ScopedLock lock(streamSetLock);
+  if (streamSet.find(s) == streamSet.end()) {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace hip
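+// Illustrative note (not part of the original change): streams created with
+// hipStreamNonBlocking opt out of the implicit synchronization with the NULL
+// stream that iHipWaitActiveStreams() below implements. A sketch, where dst,
+// src and nbytes are placeholders:
+//   hipStream_t s;
+//   hipStreamCreateWithFlags(&s, hipStreamNonBlocking);
+//   hipMemcpyAsync(dst, src, nbytes, hipMemcpyDeviceToDevice, s);
+//   // This copy neither waits for, nor is waited on by, NULL-stream work.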
+// ================================================================================================
+void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream) {
+  amd::Command::EventWaitList eventWaitList;
+  {
+    amd::ScopedLock lock(streamSetLock);
+
+    for (const auto& stream : streamSet) {
+      amd::HostQueue* active_queue = stream->asHostQueue();
+      // If it's the current device
+      if ((&active_queue->device() == &blocking_queue->device()) &&
+          // Make sure it's a default stream
+          ((stream->Flags() & hipStreamNonBlocking) == 0) &&
+          // and it's not the current stream
+          (active_queue != blocking_queue) &&
+          // check for a wait on the null stream
+          (stream->Null() == wait_null_stream)) {
+        // Get the last valid command
+        amd::Command* command = active_queue->getLastQueuedCommand(true);
+        if (command != nullptr) {
+          // Check the current active status
+          if (command->status() != CL_COMPLETE) {
+            command->notifyCmdQueue();
+            eventWaitList.push_back(command);
+          } else {
+            command->release();
+          }
+        }
+        // Null stream, hence there is nothing else to wait for
+        if (wait_null_stream) {
+          break;
+        }
+      }
+    }
+  }
+
+  // Check if we have to wait for anything
+  if (eventWaitList.size() > 0) {
+    amd::Command* command = new amd::Marker(*blocking_queue, kMarkerDisableFlush, eventWaitList);
+    if (command != nullptr) {
+      command->enqueue();
+      command->release();
+    }
+  }
+
+  // Release all active commands. It's safe after the marker was enqueued
+  for (const auto& it : eventWaitList) {
+    it->release();
+  }
+}
+
+// ================================================================================================
+void CL_CALLBACK ihipStreamCallback(cl_event event, cl_int command_exec_status, void* user_data) {
+  hipError_t status = hipSuccess;
+  StreamCallback* cbo = reinterpret_cast<StreamCallback*>(user_data);
+  cbo->callBack_(cbo->stream_, status, cbo->userData_);
+  cbo->command_->release();
+  delete cbo;
+}
+
+// ================================================================================================
+static hipError_t ihipStreamCreate(hipStream_t* stream,
+                                   unsigned int flags, hip::Stream::Priority priority,
+                                   const std::vector<uint32_t>& cuMask = {}) {
+  if (flags != hipStreamDefault && flags != hipStreamNonBlocking) {
+    return hipErrorInvalidValue;
+  }
+  hip::Stream* hStream = new hip::Stream(hip::getCurrentDevice(), priority, flags, false, cuMask);
+
+  if (hStream == nullptr || !hStream->Create()) {
+    delete hStream;
+    return hipErrorOutOfMemory;
+  }
+
+  *stream = reinterpret_cast<hipStream_t>(hStream);
+
+  return hipSuccess;
+}
+
+// ================================================================================================
+hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) {
+  HIP_INIT_API(hipStreamCreateWithFlags, stream, flags);
+
+  if (stream == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  HIP_RETURN(ihipStreamCreate(stream, flags, hip::Stream::Priority::Normal), *stream);
+}
+
+// ================================================================================================
+hipError_t hipStreamCreate(hipStream_t* stream) {
+  HIP_INIT_API(hipStreamCreate, stream);
+
+  if (stream == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  HIP_RETURN(ihipStreamCreate(stream, hipStreamDefault, hip::Stream::Priority::Normal), *stream);
+}
+
+// ================================================================================================
+hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) {
+  HIP_INIT_API(hipStreamCreateWithPriority, stream, flags, priority);
+
+  if (stream == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hip::Stream::Priority streamPriority;
+  if (priority <= hip::Stream::Priority::High) {
+    streamPriority = hip::Stream::Priority::High;
+  } else if (priority >= hip::Stream::Priority::Low) {
+    streamPriority = hip::Stream::Priority::Low;
+  } else {
+    streamPriority = hip::Stream::Priority::Normal;
+  }
+
+  HIP_RETURN(ihipStreamCreate(stream, flags, streamPriority), *stream);
+}
+
+// ================================================================================================
+hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) {
+  HIP_INIT_API(hipDeviceGetStreamPriorityRange, leastPriority, greatestPriority);
+
+  if (leastPriority != nullptr) {
+    *leastPriority = hip::Stream::Priority::Low;
+  }
+  if (greatestPriority != nullptr) {
+    *greatestPriority = hip::Stream::Priority::High;
+  }
+  HIP_RETURN(hipSuccess);
+}
+
+// ================================================================================================
+hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int* flags) {
+  HIP_INIT_API(hipStreamGetFlags, stream, flags);
+
+  if ((flags != nullptr) && (stream != nullptr)) {
+    if (!hip::isValid(stream)) {
+      HIP_RETURN(hipErrorContextIsDestroyed);
+    }
+    *flags = reinterpret_cast<hip::Stream*>(stream)->Flags();
+  } else {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  HIP_RETURN(hipSuccess);
+}
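+// Illustrative use of the creation APIs above (a sketch, not part of the
+// original change): pick the highest available priority and create a stream
+// with it.
+//   int least = 0, greatest = 0;
+//   hipDeviceGetStreamPriorityRange(&least, &greatest);
+//   hipStream_t s;
+//   hipStreamCreateWithPriority(&s, hipStreamDefault, greatest);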
+// ================================================================================================
+hipError_t hipStreamSynchronize(hipStream_t stream) {
+  HIP_INIT_API(hipStreamSynchronize, stream);
+
+  if (!hip::isValid(stream)) {
+    HIP_RETURN(hipErrorContextIsDestroyed);
+  }
+
+  // Wait for the current host queue
+  hip::getQueue(stream)->finish();
+
+  HIP_RETURN(hipSuccess);
+}
+
+// ================================================================================================
+hipError_t hipStreamDestroy(hipStream_t stream) {
+  HIP_INIT_API(hipStreamDestroy, stream);
+
+  if (stream == nullptr) {
+    HIP_RETURN(hipErrorInvalidHandle);
+  }
+
+  if (!hip::isValid(stream)) {
+    HIP_RETURN(hipErrorContextIsDestroyed);
+  }
+
+  delete reinterpret_cast<hip::Stream*>(stream);
+
+  HIP_RETURN(hipSuccess);
+}
+
+struct CallbackData {
+  int previous_read_index;
+  hip::ihipIpcEventShmem_t* shmem;
+};
+
+void WaitThenDecrementSignal(hipStream_t stream, hipError_t status, void* user_data) {
+  CallbackData* data = (CallbackData*)user_data;
+  int offset = data->previous_read_index % IPC_SIGNALS_PER_EVENT;
+  // Busy-wait until the producer advances the shared read index or clears the signal slot
+  while (data->shmem->read_index < data->previous_read_index + IPC_SIGNALS_PER_EVENT &&
+         data->shmem->signal[offset] != 0) {
+  }
+  delete data;
+}
+
+// ================================================================================================
+hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags) {
+  HIP_INIT_API(hipStreamWaitEvent, stream, event, flags);
+
+  EVENT_CAPTURE(hipStreamWaitEvent, event, stream, flags);
+
+  if (event == nullptr) {
+    HIP_RETURN(hipErrorInvalidHandle);
+  }
+
+  if (!hip::isValid(stream)) {
+    HIP_RETURN(hipErrorContextIsDestroyed);
+  }
+
+  amd::HostQueue* queue = hip::getQueue(stream);
+
+  hip::Event* e = reinterpret_cast<hip::Event*>(event);
+  if (e->flags & hipEventInterprocess) {
+    amd::Command* command = new amd::Marker(*queue, false);
+    auto t{new CallbackData{e->ipc_evt_.ipc_shmem_->read_index, e->ipc_evt_.ipc_shmem_}};
+    StreamCallback* cbo = new StreamCallback(
+        stream, reinterpret_cast<hipStreamCallback_t>(WaitThenDecrementSignal), t, command);
+    if (!command->setCallback(CL_COMPLETE, ihipStreamCallback, cbo)) {
+      command->release();
+      HIP_RETURN(hipErrorInvalidHandle);
+    }
+    command->enqueue();
+    command->awaitCompletion();
+    HIP_RETURN(hipSuccess);
+  } else {
+    HIP_RETURN(e->streamWait(queue, flags));
+  }
+}
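+// Illustrative cross-stream ordering with the wait API above (a sketch, not
+// part of the original change; dA, hA, dB and n are placeholders):
+//   hipEvent_t ev;  hipEventCreate(&ev);
+//   hipMemcpyAsync(dA, hA, n, hipMemcpyHostToDevice, s1);
+//   hipEventRecord(ev, s1);
+//   hipStreamWaitEvent(s2, ev, 0);   // work queued on s2 now starts after the copy
+//   hipMemcpyAsync(dB, dA, n, hipMemcpyDeviceToDevice, s2);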
+// ================================================================================================
+hipError_t hipStreamQuery(hipStream_t stream) {
+  HIP_INIT_API(hipStreamQuery, stream);
+
+  if (!hip::isValid(stream)) {
+    HIP_RETURN(hipErrorContextIsDestroyed);
+  }
+
+  amd::HostQueue* hostQueue = hip::getQueue(stream);
+
+  amd::Command* command = hostQueue->getLastQueuedCommand(true);
+  if (command == nullptr) {
+    // Nothing was submitted to the queue
+    HIP_RETURN(hipSuccess);
+  }
+
+  amd::Event& event = command->event();
+  if (command->type() != 0) {
+    event.notifyCmdQueue();
+  }
+  hipError_t status = (command->status() == CL_COMPLETE) ? hipSuccess : hipErrorNotReady;
+  command->release();
+  HIP_RETURN(status);
+}
+
+// ================================================================================================
+hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData,
+                                unsigned int flags) {
+  HIP_INIT_API(hipStreamAddCallback, stream, callback, userData, flags);
+  // flags - reserved for future use, must be 0
+  if (callback == nullptr || flags != 0) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  if (!hip::isValid(stream)) {
+    HIP_RETURN(hipErrorContextIsDestroyed);
+  }
+
+  amd::HostQueue* hostQueue = hip::getQueue(stream);
+  amd::Command* last_command = hostQueue->getLastQueuedCommand(true);
+  amd::Command::EventWaitList eventWaitList;
+  if (last_command != nullptr) {
+    eventWaitList.push_back(last_command);
+  }
+  amd::Command* command = new amd::Marker(*hostQueue, !kMarkerDisableFlush, eventWaitList);
+  if (command == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  amd::Event& event = command->event();
+  StreamCallback* cbo = new StreamCallback(stream, callback, userData, command);
+
+  if ((cbo == nullptr) || !event.setCallback(CL_COMPLETE, ihipStreamCallback, cbo)) {
+    command->release();
+    if (last_command != nullptr) {
+      last_command->release();
+    }
+    HIP_RETURN(hipErrorInvalidHandle);
+  }
+  command->enqueue();
+  // @note: don't release the command here, because it will be released after the HIP callback
+
+  if (last_command != nullptr) {
+    last_command->release();
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+// ================================================================================================
+hipError_t hipExtStreamCreateWithCUMask(hipStream_t* stream, uint32_t cuMaskSize,
+                                        const uint32_t* cuMask) {
+  HIP_INIT_API(hipExtStreamCreateWithCUMask, stream, cuMaskSize, cuMask);
+
+  if (stream == nullptr) {
+    HIP_RETURN(hipErrorInvalidHandle);
+  }
+  if (cuMaskSize == 0 || cuMask == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  const std::vector<uint32_t> cuMaskv(cuMask, cuMask + cuMaskSize);
+
+  HIP_RETURN(ihipStreamCreate(stream, hipStreamDefault, hip::Stream::Priority::Normal, cuMaskv),
+             *stream);
+}
+
+// ================================================================================================
+hipError_t hipStreamGetPriority(hipStream_t stream, int* priority) {
+  HIP_INIT_API(hipStreamGetPriority, stream, priority);
+  if ((priority != nullptr) && (stream != nullptr)) {
+    if (!hip::isValid(stream)) {
+      HIP_RETURN(hipErrorContextIsDestroyed);
+    }
+    *priority = static_cast<int>(reinterpret_cast<hip::Stream*>(stream)->GetPriority());
+  } else {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  HIP_RETURN(hipSuccess);
+}
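+// Illustrative use of the CU-mask APIs above and below (a sketch, not part of
+// the original change): restrict a stream to the first 8 CUs, then read the
+// effective mask back.
+//   uint32_t mask = 0xff;               // one bit per CU, 32 CUs per word
+//   hipStream_t s;
+//   hipExtStreamCreateWithCUMask(&s, 1, &mask);
+//   uint32_t readBack[4] = {};          // sized for up to 128 CUs
+//   hipExtStreamGetCUMask(s, 4, readBack);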
+// ================================================================================================
+hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32_t* cuMask) {
+  HIP_INIT_API(hipExtStreamGetCUMask, stream, cuMaskSize, cuMask);
+
+  if (cuMask == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  int deviceId = hip::getCurrentDevice()->deviceId();
+  auto* deviceHandle = g_devices[deviceId]->devices()[0];
+  const auto& info = deviceHandle->info();
+
+  // Find the minimum cuMaskSize required to represent the CU mask bit-array in words
+  // of 32 bits, and return an error if the cuMaskSize argument is smaller than that
+  uint32_t cuMaskSizeRequired =
+      info.maxComputeUnits_ / 32 + ((info.maxComputeUnits_ % 32) ? 1 : 0);
+
+  if (cuMaskSize < cuMaskSizeRequired) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  // Make a default CU mask bit-array where all CUs are active.
+  // This default mask will be returned when there is no
+  // custom or global CU mask defined.
+  std::vector<uint32_t> defaultCUMask;
+  uint32_t temp = 0;
+  uint32_t bit_index = 0;
+  for (uint32_t i = 0; i < info.maxComputeUnits_; i++) {
+    temp |= 1UL << bit_index;
+    if (bit_index >= 32) {
+      defaultCUMask.push_back(temp);
+      temp = 0;
+      bit_index = 0;
+      temp |= 1UL << bit_index;
+    }
+    bit_index += 1;
+  }
+  if (bit_index != 0) {
+    defaultCUMask.push_back(temp);
+  }
+
+  // If the stream is null then either return globalCUMask_ (if it is defined)
+  // or return defaultCUMask
+  if (stream == nullptr) {
+    if (info.globalCUMask_.size() != 0) {
+      std::copy(info.globalCUMask_.begin(), info.globalCUMask_.end(), cuMask);
+    } else {
+      std::copy(defaultCUMask.begin(), defaultCUMask.end(), cuMask);
+    }
+  } else {
+    // If the stream is not null then get the stream's CU mask and return one of the cases below:
+    // case 1: if globalCUMask_ is defined, return the AND of globalCUMask_ and the stream's CU mask
+    // case 2: if globalCUMask_ is not defined, return the AND of defaultCUMask and the stream's CU mask
+    // In both cases, if the stream's CU mask is empty, either globalCUMask_ (case 1)
+    // or defaultCUMask (case 2) will be returned
+    std::vector<uint32_t> streamCUMask;
+    streamCUMask = reinterpret_cast<hip::Stream*>(stream)->GetCUMask();
+    std::vector<uint32_t> mask = {};
+    if (info.globalCUMask_.size() != 0) {
+      for (uint32_t i = 0; i < std::min(streamCUMask.size(), info.globalCUMask_.size()); i++) {
+        mask.push_back(streamCUMask[i] & info.globalCUMask_[i]);
+      }
+    } else {
+      for (uint32_t i = 0; i < std::min(streamCUMask.size(), defaultCUMask.size()); i++) {
+        mask.push_back(streamCUMask[i] & defaultCUMask[i]);
+      }
+    }
+    // Check to make sure that ANDing the (custom-defined) streamCUMask with the global
+    // CU mask left a non-zero mask; otherwise return either globalCUMask_ or defaultCUMask
+    bool zeroCUMask = true;
+    for (auto m : mask) {
+      if (m != 0) {
+        zeroCUMask = false;
+        break;
+      }
+    }
+    if (zeroCUMask) {
+      mask = (info.globalCUMask_.size() != 0) ? info.globalCUMask_ : defaultCUMask;
+    }
+    std::copy(mask.begin(), mask.end(), cuMask);
+  }
+  HIP_RETURN(hipSuccess);
+}
diff --git a/rocclr/hip_stream_ops.cpp b/rocclr/hip_stream_ops.cpp
new file mode 100644
index 0000000000..06018e639c
--- /dev/null
+++ b/rocclr/hip_stream_ops.cpp
@@ -0,0 +1,129 @@
+/* Copyright (c) 2015-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include <hip/hip_runtime.h>
+#include "hip_internal.hpp"
+#include "platform/command_utils.hpp"
+
+hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void* ptr,
+                               int64_t value, uint64_t mask, unsigned int flags,
+                               size_t sizeBytes) {
+  size_t offset = 0;
+  unsigned int outFlags = 0;
+
+  amd::Memory* memory = getMemoryObject(ptr, offset);
+  if (!memory) {
+    return hipErrorInvalidValue;
+  }
+
+  // NOTE: 'mask' is only used in the Wait operation and 'sizeBytes' is only used in the
+  // Write operation. 'flags' is for now used only for Wait, but in the future there will
+  // be use cases for Write too.
+
+  if (cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE) {
+    // Wait is only supported on SignalMemory objects
+    if (!(memory->getMemFlags() & ROCCLR_MEM_HSA_SIGNAL_MEMORY)) {
+      return hipErrorInvalidValue;
+    }
+    switch (flags) {
+      case hipStreamWaitValueGte:
+        outFlags = ROCCLR_STREAM_WAIT_VALUE_GTE;
+        break;
+      case hipStreamWaitValueEq:
+        outFlags = ROCCLR_STREAM_WAIT_VALUE_EQ;
+        break;
+      case hipStreamWaitValueAnd:
+        outFlags = ROCCLR_STREAM_WAIT_VALUE_AND;
+        break;
+      case hipStreamWaitValueNor:
+        outFlags = ROCCLR_STREAM_WAIT_VALUE_NOR;
+        break;
+      default:
+        ShouldNotReachHere();
+        break;
+    }
+  } else if (cmdType != ROCCLR_COMMAND_STREAM_WRITE_VALUE) {
+    return hipErrorInvalidValue;
+  }
+
+  amd::HostQueue* queue = hip::getQueue(stream);
+  amd::Command::EventWaitList waitList;
+
+  amd::StreamOperationCommand* command =
+      new amd::StreamOperationCommand(*queue, cmdType, waitList, *memory->asBuffer(),
+                                      value, mask, outFlags, offset, sizeBytes);
+
+  if (command == nullptr) {
+    return hipErrorOutOfMemory;
+  }
+  command->enqueue();
+  command->release();
+  return hipSuccess;
+}
+
+hipError_t hipStreamWaitValue32(hipStream_t stream, void* ptr, int32_t value, unsigned int flags,
+                                uint32_t mask) {
+  HIP_INIT_API(hipStreamWaitValue32, stream, ptr, value, mask, flags);
+  // NOTE: 'ptr' corresponds to HSA signal memory, which is 64 bits wide.
+  // The 32-bit value and mask are converted to 64-bit values.
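+  // Illustrative host-side use of the write/wait pair (a sketch, not part of
+  // the original change; assumes 'sig' was allocated as signal memory, e.g.
+  // via hipExtMallocWithFlags(..., hipMallocSignalMemory)):
+  //   hipStreamWriteValue32(s1, sig, 1, 0);             // producer stream
+  //   hipStreamWaitValue32(s2, sig, 1,
+  //                        hipStreamWaitValueEq, ~0u);  // consumer stream blocks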
+ HIP_RETURN_DURATION(ihipStreamOperation( + stream, + ROCCLR_COMMAND_STREAM_WAIT_VALUE, + ptr, + value, + mask, + flags, + 0)); // sizeBytes un-used for wait, set it to 0 +} + +hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, int64_t value, unsigned int flags, + uint64_t mask) { + HIP_INIT_API(hipStreamWaitValue64, stream, ptr, value, mask, flags); + HIP_RETURN_DURATION(ihipStreamOperation( + stream, + ROCCLR_COMMAND_STREAM_WAIT_VALUE, + ptr, + value, + mask, + flags, + 0)); // sizeBytes un-used for wait, set it to 0 +} + +hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, int32_t value, unsigned int flags) { + HIP_INIT_API(hipStreamWriteValue32, stream, ptr, value, flags); + HIP_RETURN_DURATION(ihipStreamOperation( + stream, + ROCCLR_COMMAND_STREAM_WRITE_VALUE, + ptr, + value, + 0, // mask un-used set it to 0 + 0, // flags un-used for now set it to 0 + 4)); +} + +hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, int64_t value, unsigned int flags) { + HIP_INIT_API(hipStreamWriteValue64, stream, ptr, value, flags); + HIP_RETURN_DURATION(ihipStreamOperation( + stream, + ROCCLR_COMMAND_STREAM_WRITE_VALUE, + ptr, + value, + 0, // mask un-used set it to 0 + 0, // flags un-used for now set it to 0 + 8)); +} diff --git a/rocclr/hip_surface.cpp b/rocclr/hip_surface.cpp new file mode 100644 index 0000000000..c88e3ea3cf --- /dev/null +++ b/rocclr/hip_surface.cpp @@ -0,0 +1,37 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include + +#include "hip_internal.hpp" +#include + +hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, + const hipResourceDesc* pResDesc) { + HIP_INIT_API(hipCreateSurfaceObject, pSurfObject, pResDesc); + + HIP_RETURN(hipErrorNotSupported); +} + +hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject) { + HIP_INIT_API(hipDestroySurfaceObject, surfaceObject); + + HIP_RETURN(hipErrorNotSupported); +} diff --git a/rocclr/hip_texture.cpp b/rocclr/hip_texture.cpp new file mode 100755 index 0000000000..304cc818a1 --- /dev/null +++ b/rocclr/hip_texture.cpp @@ -0,0 +1,1303 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include +#include +#include "hip_internal.hpp" +#include "hip_platform.hpp" +#include "hip_conversions.hpp" +#include "platform/sampler.hpp" + +hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, + amd::HostQueue& queue, bool isAsync = false); + +struct __hip_texture { + uint32_t imageSRD[HIP_IMAGE_OBJECT_SIZE_DWORD]; + uint32_t samplerSRD[HIP_SAMPLER_OBJECT_SIZE_DWORD]; + amd::Image* image; + amd::Sampler* sampler; + hipResourceDesc resDesc; + hipTextureDesc texDesc; + hipResourceViewDesc resViewDesc; + + __hip_texture(amd::Image* image_, + amd::Sampler* sampler_, + const hipResourceDesc& resDesc_, + const hipTextureDesc& texDesc_, + const hipResourceViewDesc& resViewDesc_) : + image(image_), + sampler(sampler_), + resDesc(resDesc_), + texDesc(texDesc_), + resViewDesc(resViewDesc_) { + amd::Context& context = *hip::getCurrentDevice()->asContext(); + amd::Device& device = *context.devices()[0]; + + device::Memory* imageMem = image->getDeviceMemory(device); + std::memcpy(imageSRD, imageMem->cpuSrd(), sizeof(imageSRD)); + + device::Sampler* samplerMem = sampler->getDeviceSampler(device); + std::memcpy(samplerSRD, samplerMem->hwState(), sizeof(samplerSRD)); + } +}; + +amd::Image* ihipImageCreate(const cl_channel_order channelOrder, + const cl_channel_type channelType, + const cl_mem_object_type imageType, + const size_t imageWidth, + const size_t imageHeight, + const size_t imageDepth, + const size_t imageArraySize, + const size_t imageRowPitch, + const size_t imageSlicePitch, + const uint32_t numMipLevels, + amd::Memory* buffer); + +hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, + const hipResourceDesc* pResDesc, + const hipTextureDesc* pTexDesc, + const hipResourceViewDesc* pResViewDesc) { + amd::Device* device = hip::getCurrentDevice()->devices()[0]; + const device::Info& info = device->info(); + + // Validate input params + if (pTexObject == nullptr || pResDesc == nullptr || pTexDesc == nullptr) { + return hipErrorInvalidValue; + } + + // pResViewDesc can only be specified if the type of resource is a HIP array or a HIP mipmapped array. + if ((pResViewDesc != nullptr) && + ((pResDesc->resType != hipResourceTypeArray) && (pResDesc->resType != hipResourceTypeMipmappedArray))) { + return hipErrorInvalidValue; + } + + // If hipResourceDesc::resType is set to hipResourceTypeArray, + // hipResourceDesc::res::array::array must be set to a valid HIP array handle. 
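+  // Illustrative caller-side setup for this validation path (a sketch, not
+  // part of the original change; 'array' stands for a hipArray_t returned by
+  // hipMallocArray):
+  //   hipResourceDesc rd = {};
+  //   rd.resType = hipResourceTypeArray;
+  //   rd.res.array.array = array;
+  //   hipTextureDesc td = {};
+  //   td.filterMode = hipFilterModePoint;
+  //   td.readMode = hipReadModeElementType;
+  //   hipTextureObject_t tex;
+  //   hipCreateTextureObject(&tex, &rd, &td, nullptr);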
+ if ((pResDesc->resType == hipResourceTypeArray) && + (pResDesc->res.array.array == nullptr)) { + return hipErrorInvalidValue; + } + + // If hipResourceDesc::resType is set to hipResourceTypeMipmappedArray, + // hipResourceDesc::res::mipmap::mipmap must be set to a valid HIP mipmapped array handle + // and hipTextureDesc::normalizedCoords must be set to true. + if ((pResDesc->resType == hipResourceTypeMipmappedArray) && + ((pResDesc->res.mipmap.mipmap == nullptr) || (pTexDesc->normalizedCoords == 0))) { + return hipErrorInvalidValue; + } + + // If hipResourceDesc::resType is set to hipResourceTypeLinear, + // hipResourceDesc::res::linear::devPtr must be set to a valid device pointer, that is aligned to hipDeviceProp::textureAlignment. + // The total number of elements in the linear address range cannot exceed hipDeviceProp::maxTexture1DLinear. + if ((pResDesc->resType == hipResourceTypeLinear) && + ((pResDesc->res.linear.devPtr == nullptr) || + (!amd::isMultipleOf(pResDesc->res.linear.devPtr, info.imageBaseAddressAlignment_)) || + ((pResDesc->res.linear.sizeInBytes / hip::getElementSize(pResDesc->res.linear.desc)) >= info.imageMaxBufferSize_))) { + return hipErrorInvalidValue; + } + + // If hipResourceDesc::resType is set to hipResourceTypePitch2D, + // hipResourceDesc::res::pitch2D::devPtr must be set to a valid device pointer, that is aligned to hipDeviceProp::textureAlignment. + // hipResourceDesc::res::pitch2D::width and hipResourceDesc::res::pitch2D::height specify the width and height of the array in elements, + // and cannot exceed hipDeviceProp::maxTexture2DLinear[0] and hipDeviceProp::maxTexture2DLinear[1] respectively. + // hipResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to hipDeviceProp::texturePitchAlignment. + // Pitch cannot exceed hipDeviceProp::maxTexture2DLinear[2]. + if ((pResDesc->resType == hipResourceTypePitch2D) && + ((pResDesc->res.pitch2D.devPtr == nullptr) || + (!amd::isMultipleOf(pResDesc->res.pitch2D.devPtr, info.imageBaseAddressAlignment_)) || + (pResDesc->res.pitch2D.width >= info.image2DMaxWidth_) || + (pResDesc->res.pitch2D.height >= info.image2DMaxHeight_) || + (!amd::isMultipleOf(pResDesc->res.pitch2D.pitchInBytes, info.imagePitchAlignment_)))) { + // TODO check pitch limits. + return hipErrorInvalidValue; + } + + // Mipmaps are currently not supported. + if (pResDesc->resType == hipResourceTypeMipmappedArray) { + return hipErrorNotSupported; + } + // We don't program the border_color_ptr field in the HW sampler SRD. + if (pTexDesc->addressMode[0] == hipAddressModeBorder) { + return hipErrorNotSupported; + } + // We don't program the max_ansio_ratio field in the the HW sampler SRD. + if (pTexDesc->maxAnisotropy != 0) { + return hipErrorNotSupported; + } + // We don't program the lod_bias field in the HW sampler SRD. + if (pTexDesc->mipmapLevelBias != 0) { + return hipErrorNotSupported; + } + // We don't program the min_lod field in the HW sampler SRD. + if (pTexDesc->minMipmapLevelClamp != 0) { + return hipErrorNotSupported; + } + // We don't program the max_lod field in the HW sampler SRD. + if (pTexDesc->maxMipmapLevelClamp != 0) { + return hipErrorNotSupported; + } + + // TODO ROCclr assumes all dimensions have the same addressing mode. + cl_addressing_mode addressMode = CL_ADDRESS_NONE; + // If hipTextureDesc::normalizedCoords is set to zero, + // hipAddressModeWrap and hipAddressModeMirror won't be supported + // and will be switched to hipAddressModeClamp. 
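+  // For example (an illustrative note, not part of the original change): a
+  // texture created with texDesc.normalizedCoords = 0 that requests
+  // hipAddressModeWrap is treated here as if hipAddressModeClamp had been
+  // specified, presumably because wrap and mirror are only meaningful for
+  // normalized coordinates in [0, 1).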
+ if ((pTexDesc->normalizedCoords == 0) && + ((pTexDesc->addressMode[0] == hipAddressModeWrap) || (pTexDesc->addressMode[0] == hipAddressModeMirror))) { + addressMode = hip::getCLAddressingMode(hipAddressModeClamp); + } + // hipTextureDesc::addressMode is ignored if hipResourceDesc::resType is hipResourceTypeLinear + else if (pResDesc->resType != hipResourceTypeLinear) { + addressMode = hip::getCLAddressingMode(pTexDesc->addressMode[0]); + } + +#ifndef CL_FILTER_NONE +#define CL_FILTER_NONE 0x1142 +#endif + cl_filter_mode filterMode = CL_FILTER_NONE; +#undef CL_FILTER_NONE + // hipTextureDesc::filterMode is ignored if hipResourceDesc::resType is hipResourceTypeLinear. + if (pResDesc->resType != hipResourceTypeLinear) { + filterMode = hip::getCLFilterMode(pTexDesc->filterMode); + } + +#ifndef CL_FILTER_NONE +#define CL_FILTER_NONE 0x1142 +#endif + cl_filter_mode mipFilterMode = CL_FILTER_NONE; +#undef CL_FILTER_NONE + if (pResDesc->resType == hipResourceTypeMipmappedArray) { + mipFilterMode = hip::getCLFilterMode(pTexDesc->mipmapFilterMode); + } + + amd::Sampler* sampler = new amd::Sampler(*hip::getCurrentDevice()->asContext(), + pTexDesc->normalizedCoords, + addressMode, + filterMode, + mipFilterMode, + pTexDesc->minMipmapLevelClamp, + pTexDesc->maxMipmapLevelClamp); + + if (sampler == nullptr) { + return hipErrorOutOfMemory; + } + + if (!sampler->create()) { + delete sampler; + return hipErrorOutOfMemory; + } + + amd::Image* image = nullptr; + switch (pResDesc->resType) { + case hipResourceTypeArray: { + cl_mem memObj = reinterpret_cast(pResDesc->res.array.array->data); + if (!is_valid(memObj)) { + return hipErrorInvalidValue; + } + image = as_amd(memObj)->asImage(); + + hipTextureReadMode readMode = pTexDesc->readMode; + // 32-bit integer format will not be promoted, regardless of whether or not + // this hipTextureDesc::readMode is set hipReadModeNormalizedFloat is specified. + if ((pResDesc->res.array.array->Format == HIP_AD_FORMAT_SIGNED_INT32) || + (pResDesc->res.array.array->Format == HIP_AD_FORMAT_UNSIGNED_INT32)) { + readMode = hipReadModeElementType; + } + + // We need to create an image view if the user requested to use normalized pixel values, + // due to already having the image created with a different format. + if ((pResViewDesc != nullptr) || + (readMode == hipReadModeNormalizedFloat) || + (pTexDesc->sRGB == 1)) { + // TODO ROCclr currently right now can only change the format of the image. + const cl_channel_order channelOrder = (pResViewDesc != nullptr) ? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB) : + hip::getCLChannelOrder(pResDesc->res.array.array->NumChannels, pTexDesc->sRGB); + const cl_channel_type channelType = (pResViewDesc != nullptr) ? 
hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode) : + hip::getCLChannelType(pResDesc->res.array.array->Format, readMode); + const amd::Image::Format imageFormat(cl_image_format{channelOrder, channelType}); + if (!imageFormat.isValid()) { + return hipErrorInvalidValue; + } + + image = image->createView(*hip::getCurrentDevice()->asContext(), imageFormat, nullptr); + if (image == nullptr) { + return hipErrorInvalidValue; + } + } + break; + } + case hipResourceTypeMipmappedArray: { + ShouldNotReachHere(); + break; + } + case hipResourceTypeLinear: { + const cl_channel_order channelOrder = hip::getCLChannelOrder(hip::getNumChannels(pResDesc->res.linear.desc), pTexDesc->sRGB); + const cl_channel_type channelType = hip::getCLChannelType(hip::getArrayFormat(pResDesc->res.linear.desc), pTexDesc->readMode); + const amd::Image::Format imageFormat({channelOrder, channelType}); + const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType); + const size_t imageSizeInBytes = pResDesc->res.linear.sizeInBytes; + amd::Memory* buffer = getMemoryObjectWithOffset(pResDesc->res.linear.devPtr, imageSizeInBytes); + image = ihipImageCreate(channelOrder, + channelType, + imageType, + imageSizeInBytes / imageFormat.getElementSize(), /* imageWidth */ + 0, /* imageHeight */ + 0, /* imageDepth */ + 0, /* imageArraySize */ + 0, /* imageRowPitch */ + 0, /* imageSlicePitch */ + 0, /* numMipLevels */ + buffer); + buffer->release(); + if (image == nullptr) { + return hipErrorInvalidValue; + } + break; + } + case hipResourceTypePitch2D: { + const cl_channel_order channelOrder = hip::getCLChannelOrder(hip::getNumChannels(pResDesc->res.pitch2D.desc), pTexDesc->sRGB); + const cl_channel_type channelType = hip::getCLChannelType(hip::getArrayFormat(pResDesc->res.pitch2D.desc), pTexDesc->readMode); + const amd::Image::Format imageFormat({channelOrder, channelType}); + const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType); + const size_t imageSizeInBytes = pResDesc->res.pitch2D.width * imageFormat.getElementSize() + + pResDesc->res.pitch2D.pitchInBytes * (pResDesc->res.pitch2D.height - 1); + amd::Memory* buffer = getMemoryObjectWithOffset(pResDesc->res.pitch2D.devPtr, imageSizeInBytes); + image = ihipImageCreate(channelOrder, + channelType, + imageType, + pResDesc->res.pitch2D.width, /* imageWidth */ + pResDesc->res.pitch2D.height, /* imageHeight */ + 0, /* imageDepth */ + 0, /* imageArraySize */ + pResDesc->res.pitch2D.pitchInBytes, /* imageRowPitch */ + 0, /* imageSlicePitch */ + 0, /* numMipLevels */ + buffer); + buffer->release(); + if (image == nullptr) { + return hipErrorInvalidValue; + } + break; + } + } + + void *texObjectBuffer = nullptr; + ihipMalloc(&texObjectBuffer, sizeof(__hip_texture), CL_MEM_SVM_FINE_GRAIN_BUFFER); + if (texObjectBuffer == nullptr) { + return hipErrorOutOfMemory; + } + *pTexObject = new (texObjectBuffer) __hip_texture{image, sampler, *pResDesc, *pTexDesc, (pResViewDesc != nullptr) ? 
+      *pResViewDesc : hipResourceViewDesc{}};
+
+  return hipSuccess;
+}
+
+hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
+                                  const hipResourceDesc* pResDesc,
+                                  const hipTextureDesc* pTexDesc,
+                                  const hipResourceViewDesc* pResViewDesc) {
+  HIP_INIT_API(hipCreateTextureObject, pTexObject, pResDesc, pTexDesc, pResViewDesc);
+
+  HIP_RETURN(ihipCreateTextureObject(pTexObject, pResDesc, pTexDesc, pResViewDesc));
+}
+
+hipError_t ihipDestroyTextureObject(hipTextureObject_t texObject) {
+  if (texObject == nullptr) {
+    return hipSuccess;
+  }
+
+  const hipResourceType type = texObject->resDesc.resType;
+  const bool isImageFromBuffer =
+      (type == hipResourceTypeLinear) || (type == hipResourceTypePitch2D);
+  const bool isImageView =
+      ((type == hipResourceTypeArray) || (type == hipResourceTypeMipmappedArray)) &&
+      !texObject->image->isParent();
+  if (isImageFromBuffer || isImageView) {
+    texObject->image->release();
+  }
+
+  // TODO: should call ihipFree() instead, to not pollute the API trace.
+  return hipFree(texObject);
+}
+
+hipError_t hipDestroyTextureObject(hipTextureObject_t texObject) {
+  HIP_INIT_API(hipDestroyTextureObject, texObject);
+
+  HIP_RETURN(ihipDestroyTextureObject(texObject));
+}
+
+hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc,
+                                           hipTextureObject_t texObject) {
+  HIP_INIT_API(hipGetTextureObjectResourceDesc, pResDesc, texObject);
+
+  if ((pResDesc == nullptr) || (texObject == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pResDesc = texObject->resDesc;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipGetTextureObjectResourceViewDesc(hipResourceViewDesc* pResViewDesc,
+                                               hipTextureObject_t texObject) {
+  HIP_INIT_API(hipGetTextureObjectResourceViewDesc, pResViewDesc, texObject);
+
+  if ((pResViewDesc == nullptr) || (texObject == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pResViewDesc = texObject->resViewDesc;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipGetTextureObjectTextureDesc(hipTextureDesc* pTexDesc,
+                                          hipTextureObject_t texObject) {
+  HIP_INIT_API(hipGetTextureObjectTextureDesc, pTexDesc, texObject);
+
+  if ((pTexDesc == nullptr) || (texObject == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pTexDesc = texObject->texDesc;
+
+  HIP_RETURN(hipSuccess);
+}
+
+inline bool ihipGetTextureAlignmentOffset(size_t* offset,
+                                          const void* devPtr) {
+  amd::Device* device = hip::getCurrentDevice()->devices()[0];
+  const device::Info& info = device->info();
+
+  const char* alignedDevPtr =
+      amd::alignUp(static_cast<const char*>(devPtr), info.imageBaseAddressAlignment_);
+  const size_t alignedOffset = alignedDevPtr - static_cast<const char*>(devPtr);
+
+  // If the device memory pointer was returned from hipMalloc(),
+  // the offset is guaranteed to be 0 and NULL may be passed as the offset parameter.
+  if ((alignedOffset != 0) && (offset == nullptr)) {
+    LogPrintfError("Texture object not aligned with offset %u \n", alignedOffset);
+    return false;
+  }
+
+  if (offset != nullptr) {
+    *offset = alignedOffset;
+  }
+
+  return true;
+}
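+// Illustrative use of the legacy binding path below (a sketch, not part of
+// the original change; 'texRef' stands for a module-scope texture<> reference):
+//   float* buf;  hipMalloc(&buf, 1024 * sizeof(float));
+//   hipChannelFormatDesc cd = hipCreateChannelDesc<float>();
+//   size_t off = 0;
+//   hipBindTexture(&off, texRef, buf, &cd, 1024 * sizeof(float));
+//   ...                                  // launch kernels that fetch via texRef
+//   hipUnbindTexture(texRef);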
+hipError_t ihipBindTexture(size_t* offset,
+                           const textureReference* texref,
+                           const void* devPtr,
+                           const hipChannelFormatDesc* desc,
+                           size_t size) {
+  if ((texref == nullptr) ||
+      (devPtr == nullptr) ||
+      (desc == nullptr)) {
+    return hipErrorInvalidValue;
+  }
+
+  // Any previous address or HIP array state associated with the texture reference
+  // is superseded by this function. Any memory previously bound to hTexRef is unbound.
+  // No need to check for errors.
+  ihipDestroyTextureObject(texref->textureObject);
+
+  hipResourceDesc resDesc = {};
+  resDesc.resType = hipResourceTypeLinear;
+  resDesc.res.linear.devPtr = const_cast<void*>(devPtr);
+  resDesc.res.linear.desc = *desc;
+  resDesc.res.linear.sizeInBytes = size;
+
+  if (ihipGetTextureAlignmentOffset(offset, devPtr)) {
+    // Align the user pointer to the HW requirements.
+    resDesc.res.linear.devPtr =
+        static_cast<char*>(const_cast<void*>(devPtr)) - *offset;
+  } else {
+    return hipErrorInvalidValue;
+  }
+
+  hipTextureDesc texDesc = hip::getTextureDesc(texref);
+
+  return ihipCreateTextureObject(const_cast<hipTextureObject_t*>(&texref->textureObject),
+                                 &resDesc, &texDesc, nullptr);
+}
+
+hipError_t ihipBindTexture2D(size_t* offset,
+                             const textureReference* texref,
+                             const void* devPtr,
+                             const hipChannelFormatDesc* desc,
+                             size_t width,
+                             size_t height,
+                             size_t pitch) {
+  if ((texref == nullptr) ||
+      (devPtr == nullptr) ||
+      (desc == nullptr)) {
+    return hipErrorInvalidValue;
+  }
+
+  // Any previous address or HIP array state associated with the texture reference
+  // is superseded by this function. Any memory previously bound to hTexRef is unbound.
+  // No need to check for errors.
+  ihipDestroyTextureObject(texref->textureObject);
+
+  hipResourceDesc resDesc = {};
+  resDesc.resType = hipResourceTypePitch2D;
+  resDesc.res.pitch2D.devPtr = const_cast<void*>(devPtr);
+  resDesc.res.pitch2D.desc = *desc;
+  resDesc.res.pitch2D.width = width;
+  resDesc.res.pitch2D.height = height;
+  resDesc.res.pitch2D.pitchInBytes = pitch;
+
+  if (ihipGetTextureAlignmentOffset(offset, devPtr)) {
+    // Align the user pointer to the HW requirements.
+    resDesc.res.pitch2D.devPtr =
+        static_cast<char*>(const_cast<void*>(devPtr)) - *offset;
+  } else {
+    return hipErrorInvalidValue;
+  }
+
+  hipTextureDesc texDesc = hip::getTextureDesc(texref);
+
+  return ihipCreateTextureObject(const_cast<hipTextureObject_t*>(&texref->textureObject),
+                                 &resDesc, &texDesc, nullptr);
+}
+
+hipError_t hipBindTexture2D(size_t* offset,
+                            const textureReference* texref,
+                            const void* devPtr,
+                            const hipChannelFormatDesc* desc,
+                            size_t width,
+                            size_t height,
+                            size_t pitch) {
+  HIP_INIT_API(hipBindTexture2D, offset, texref, devPtr, desc, width, height, pitch);
+
+  hipDeviceptr_t refDevPtr = nullptr;
+  size_t refDevSize = 0;
+
+  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(),
+                                                               &refDevPtr, &refDevSize));
+
+  assert(refDevSize == sizeof(textureReference));
+  hipError_t err = ihipBindTexture2D(offset, texref, devPtr, desc, width, height, pitch);
+  if (err != hipSuccess) {
+    HIP_RETURN(err);
+  }
+  // Copy the updated texture reference to the device.
+  amd::HostQueue* queue = hip::getNullStream();
+  HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue));
+}
+
+hipError_t ihipBindTextureToArray(const textureReference* texref,
+                                  hipArray_const_t array,
+                                  const hipChannelFormatDesc* desc) {
+  if ((texref == nullptr) ||
+      (array == nullptr) ||
+      (desc == nullptr)) {
+    return hipErrorInvalidValue;
+  }
+
+  // Any previous address or HIP array state associated with the texture reference
+  // is superseded by this function. Any memory previously bound to hTexRef is unbound.
+  // No need to check for errors.
+ ihipDestroyTextureObject(texref->textureObject); + + hipResourceDesc resDesc = {}; + resDesc.resType = hipResourceTypeArray; + resDesc.res.array.array = const_cast(array); + + hipTextureDesc texDesc = hip::getTextureDesc(texref); + + hipResourceViewFormat format = hip::getResourceViewFormat(*desc); + hipResourceViewDesc resViewDesc = hip::getResourceViewDesc(array, format); + + return ihipCreateTextureObject(const_cast(&texref->textureObject), &resDesc, &texDesc, &resViewDesc); +} + +hipError_t hipBindTextureToArray(const textureReference* texref, + hipArray_const_t array, + const hipChannelFormatDesc* desc) { + HIP_INIT_API(hipBindTextureToArray, texref, array, desc); + + hipDeviceptr_t refDevPtr = nullptr; + size_t refDevSize = 0; + HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr, + &refDevSize)); + + assert(refDevSize == sizeof(textureReference)); + hipError_t err = ihipBindTextureToArray(texref, array, desc); + if (err != hipSuccess) { + HIP_RETURN(err); + } + // Copy to device. + amd::HostQueue* queue = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); +} + +hipError_t ihipBindTextureToMipmappedArray(const textureReference* texref, + hipMipmappedArray_const_t mipmappedArray, + const hipChannelFormatDesc* desc) { + if ((texref == nullptr) || + (mipmappedArray == nullptr) || + (desc == nullptr)) { + return hipErrorInvalidValue; + } + + // Any previous address or HIP array state associated with the texture reference is superseded by this function. + // Any memory previously bound to hTexRef is unbound. + // No need to check for errors. + ihipDestroyTextureObject(texref->textureObject); + + hipResourceDesc resDesc = {}; + resDesc.resType = hipResourceTypeMipmappedArray; + resDesc.res.mipmap.mipmap = const_cast(mipmappedArray); + + hipTextureDesc texDesc = hip::getTextureDesc(texref); + + hipResourceViewFormat format = hip::getResourceViewFormat(*desc); + hipResourceViewDesc resViewDesc = hip::getResourceViewDesc(mipmappedArray, format); + + return ihipCreateTextureObject(const_cast(&texref->textureObject), &resDesc, &texDesc, &resViewDesc); +} + +hipError_t hipBindTextureToMipmappedArray(const textureReference* texref, + hipMipmappedArray_const_t mipmappedArray, + const hipChannelFormatDesc* desc) { + HIP_INIT_API(hipBindTextureToMipmappedArray, texref, mipmappedArray, desc); + + hipDeviceptr_t refDevPtr = nullptr; + size_t refDevSize = 0; + + HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr, + &refDevSize)); + + assert(refDevSize == sizeof(textureReference)); + hipError_t err = ihipBindTextureToMipmappedArray(texref, mipmappedArray, desc); + if (err != hipSuccess) { + HIP_RETURN(err); + } + // Copy to device. 
+ amd::HostQueue* queue = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); +} + +hipError_t hipUnbindTexture(const textureReference* texref) { + HIP_INIT_API(hipUnbindTexture, texref); + + if (texref == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + const hipTextureObject_t textureObject = texref->textureObject; + const_cast(texref)->textureObject = nullptr; + + HIP_RETURN(ihipDestroyTextureObject(textureObject)); +} + +hipError_t hipBindTexture(size_t* offset, + const textureReference* texref, + const void* devPtr, + const hipChannelFormatDesc* desc, + size_t size) { + HIP_INIT_API(hipBindTexture, offset, texref, devPtr, desc, size); + + hipDeviceptr_t refDevPtr = nullptr; + size_t refDevSize = 0; + HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr, + &refDevSize)); + assert(refDevSize == sizeof(textureReference)); + hipError_t err = ihipBindTexture(offset, texref, devPtr, desc, size); + if (err != hipSuccess) { + HIP_RETURN(err); + } + // Copy to device. + amd::HostQueue* queue = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); +} + +hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, + hipArray_const_t array) { + HIP_INIT_API(hipGetChannelDesc, desc, array); + + if ((desc == nullptr) || (array == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + // It is UB to call hipGetChannelDesc() on an array created via hipArrayCreate()/hipArray3DCreate(). + // This is due to hip not differentiating between runtime and driver types. + *desc = array->desc; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipGetTextureAlignmentOffset(size_t* offset, + const textureReference* texref) { + HIP_INIT_API(hipGetTextureAlignmentOffset, offset, texref); + + if ((offset == nullptr) || (texref == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + // TODO enforce alignment on devPtr. 
+ *offset = 0; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipGetTextureReference(const textureReference** texref, const void* symbol) { + HIP_INIT_API(hipGetTextureReference, texref, symbol); + + if (texref == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + *texref = reinterpret_cast(symbol); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipTexRefSetFormat(textureReference* texRef, + hipArray_Format fmt, + int NumPackedComponents) { + HIP_INIT_API(hipTexRefSetFormat, texRef, fmt, NumPackedComponents); + + if (texRef == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + texRef->format = fmt; + texRef->numChannels = NumPackedComponents; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipTexRefSetFlags(textureReference* texRef, + unsigned int Flags) { + HIP_INIT_API(hipTexRefSetFlags, texRef, Flags); + + if (texRef == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + texRef->readMode = hipReadModeNormalizedFloat; + texRef->normalized = 0; + texRef->sRGB = 0; + + if (Flags & HIP_TRSF_READ_AS_INTEGER) { + texRef->readMode = hipReadModeElementType; + } + + if (Flags & HIP_TRSF_NORMALIZED_COORDINATES) { + texRef->normalized = 1; + } + + if (Flags & HIP_TRSF_SRGB) { + texRef->sRGB = 1; + } + + HIP_RETURN(hipSuccess); +} + +hipError_t hipTexRefSetFilterMode(textureReference* texRef, + hipTextureFilterMode fm) { + HIP_INIT_API(hipTexRefSetFilterMode, texRef, fm); + + if (texRef == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + texRef->filterMode = fm; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipTexRefGetAddressMode(hipTextureAddressMode* pam, + const textureReference* texRef, + int dim) { + // TODO overload operator<<(ostream&, textureReference&). + HIP_INIT_API(hipTexRefGetAddressMode, pam, texRef, dim); + + if ((pam == nullptr) || (texRef == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + // Currently, the only valid value for dim are 0 and 1. + if ((dim != 0) && (dim != 1)) { + LogPrintfError( + "Currently only 2 dimensions (0,1) are valid," + "dim : %d \n", + dim); + HIP_RETURN(hipErrorInvalidValue); + } + + *pam = texRef->addressMode[dim]; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipTexRefSetAddressMode(textureReference* texRef, + int dim, + hipTextureAddressMode am) { + HIP_INIT_API(hipTexRefSetAddressMode, texRef, dim, am); + + if (texRef == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + if ((dim < 0) || (dim > 2)) { + LogPrintfError( + "Currently only 3 dimensions (0,1,2) are valid," + "dim : %d \n", + dim); + HIP_RETURN(hipErrorInvalidValue); + } + + texRef->addressMode[dim] = am; + + HIP_RETURN(hipSuccess); +} + +hipError_t hipTexRefGetArray(hipArray_t* pArray, + const textureReference* texRef) { + // TODO overload operator<<(ostream&, textureReference&). + HIP_INIT_API(hipTexRefGetArray, pArray, texRef); + + if ((pArray == nullptr) || (texRef == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + hipResourceDesc resDesc = {}; + // TODO use ihipGetTextureObjectResourceDesc() to not pollute the API trace. 
+  hipError_t error = hipGetTextureObjectResourceDesc(&resDesc, texRef->textureObject);
+  if (error != hipSuccess) {
+    HIP_RETURN(error);
+  }
+
+  switch (resDesc.resType) {
+    case hipResourceTypeLinear:
+    case hipResourceTypePitch2D:
+    case hipResourceTypeMipmappedArray: {
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+    case hipResourceTypeArray:
+      *pArray = resDesc.res.array.array;
+      break;
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefSetArray(textureReference* texRef,
+                             hipArray_const_t array,
+                             unsigned int flags) {
+  HIP_INIT_API(hipTexRefSetArray, texRef, array, flags);
+
+  if ((texRef == nullptr) || (array == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  if (flags != HIP_TRSA_OVERRIDE_FORMAT) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipDeviceptr_t refDevPtr = nullptr;
+  size_t refDevSize = 0;
+
+  HIP_RETURN_ONFAIL(PlatformState::instance().getDynTexGlobalVar(texRef, &refDevPtr, &refDevSize));
+  assert(refDevSize == sizeof(textureReference));
+
+  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
+  // Any memory previously bound to texRef is unbound.
+  // No need to check for errors.
+  ihipDestroyTextureObject(texRef->textureObject);
+
+  hipResourceDesc resDesc = {};
+  resDesc.resType = hipResourceTypeArray;
+  resDesc.res.array.array = const_cast<hipArray_t>(array);
+
+  hipTextureDesc texDesc = hip::getTextureDesc(texRef);
+
+  hipResourceViewFormat format = hip::getResourceViewFormat(hip::getChannelFormatDesc(texRef->numChannels, texRef->format));
+  hipResourceViewDesc resViewDesc = hip::getResourceViewDesc(array, format);
+
+  hipError_t err = ihipCreateTextureObject(&texRef->textureObject, &resDesc, &texDesc, &resViewDesc);
+  if (err != hipSuccess) {
+    HIP_RETURN(err);
+  }
+  // Copy to device.
+  amd::HostQueue* queue = hip::getNullStream();
+  HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue));
+}
+
+hipError_t hipTexRefGetAddress(hipDeviceptr_t* dptr,
+                               const textureReference* texRef) {
+  // TODO overload operator<<(ostream&, textureReference&).
+  HIP_INIT_API(hipTexRefGetAddress, dptr, texRef);
+
+  if ((dptr == nullptr) || (texRef == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipResourceDesc resDesc = {};
+  // TODO use ihipGetTextureObjectResourceDesc() to not pollute the API trace.
+  hipError_t error = hipGetTextureObjectResourceDesc(&resDesc, texRef->textureObject);
+  if (error != hipSuccess) {
+    LogPrintfError("hipGetTextureObjectResourceDesc failed with error code: %s \n",
+                   hipGetErrorName(error));
+    HIP_RETURN(error);
+  }
+
+  switch (resDesc.resType) {
+    // Need to verify.
+    // If the texture reference is not bound to any device memory range,
+    // return hipErrorInvalidValue.
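+    // Array- and mipmap-backed resources carry no raw device pointer, so they
+    // are rejected here.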
+    case hipResourceTypeArray:
+    case hipResourceTypeMipmappedArray: {
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+    case hipResourceTypeLinear:
+      *dptr = resDesc.res.linear.devPtr;
+      break;
+    case hipResourceTypePitch2D:
+      *dptr = resDesc.res.pitch2D.devPtr;
+      break;
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefSetAddress(size_t* ByteOffset,
+                               textureReference* texRef,
+                               hipDeviceptr_t dptr,
+                               size_t bytes) {
+  HIP_INIT_API(hipTexRefSetAddress, ByteOffset, texRef, dptr, bytes);
+
+  if (texRef == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipDeviceptr_t refDevPtr = nullptr;
+  size_t refDevSize = 0;
+  HIP_RETURN_ONFAIL(PlatformState::instance().getDynTexGlobalVar(texRef, &refDevPtr, &refDevSize));
+  assert(refDevSize == sizeof(textureReference));
+
+  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
+  // Any memory previously bound to texRef is unbound.
+  // No need to check for errors.
+  ihipDestroyTextureObject(texRef->textureObject);
+
+  hipResourceDesc resDesc = {};
+  resDesc.resType = hipResourceTypeLinear;
+  resDesc.res.linear.devPtr = dptr;
+  resDesc.res.linear.desc = hip::getChannelFormatDesc(texRef->numChannels, texRef->format);
+  resDesc.res.linear.sizeInBytes = bytes;
+
+  if (ihipGetTextureAlignmentOffset(ByteOffset, dptr)) {
+    // Align the user pointer to the HW requirements.
+    resDesc.res.linear.devPtr = static_cast<char*>(dptr) - *ByteOffset;
+  } else {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipTextureDesc texDesc = hip::getTextureDesc(texRef);
+
+  hipError_t err = ihipCreateTextureObject(&texRef->textureObject, &resDesc, &texDesc, nullptr);
+  if (err != hipSuccess) {
+    HIP_RETURN(err);
+  }
+  // Copy to device.
+  amd::HostQueue* queue = hip::getNullStream();
+  HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue));
+}
+
+hipError_t hipTexRefSetAddress2D(textureReference* texRef,
+                                 const HIP_ARRAY_DESCRIPTOR* desc,
+                                 hipDeviceptr_t dptr,
+                                 size_t Pitch) {
+  HIP_INIT_API(hipTexRefSetAddress2D, texRef, desc, dptr, Pitch);
+
+  if ((texRef == nullptr) || (desc == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipDeviceptr_t refDevPtr = nullptr;
+  size_t refDevSize = 0;
+  HIP_RETURN_ONFAIL(PlatformState::instance().getDynTexGlobalVar(texRef, &refDevPtr, &refDevSize));
+  assert(refDevSize == sizeof(textureReference));
+
+  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
+  // Any memory previously bound to texRef is unbound.
+  // No need to check for errors.
+  ihipDestroyTextureObject(texRef->textureObject);
+
+  hipResourceDesc resDesc = {};
+  resDesc.resType = hipResourceTypePitch2D;
+  resDesc.res.pitch2D.devPtr = dptr;
+  resDesc.res.pitch2D.desc = hip::getChannelFormatDesc(desc->NumChannels, desc->Format);
+  resDesc.res.pitch2D.width = desc->Width;
+  resDesc.res.pitch2D.height = desc->Height;
+  resDesc.res.pitch2D.pitchInBytes = Pitch;
+
+  hipTextureDesc texDesc = hip::getTextureDesc(texRef);
+
+  hipError_t err = ihipCreateTextureObject(&texRef->textureObject, &resDesc, &texDesc, nullptr);
+  if (err != hipSuccess) {
+    HIP_RETURN(err);
+  }
+  // Copy to device.
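+  // Mirror the updated host-side textureReference into its device-side shadow
+  // copy so that kernels observe the new binding.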
+  amd::HostQueue* queue = hip::getNullStream();
+  HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue));
+}
+
+hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f) {
+  return {x, y, z, w, f};
+}
+
+hipError_t hipTexRefGetBorderColor(float* pBorderColor,
+                                   const textureReference* texRef) {
+  // TODO overload operator<<(ostream&, textureReference&).
+  HIP_INIT_API(hipTexRefGetBorderColor, pBorderColor, texRef);
+
+  if ((pBorderColor == nullptr) || (texRef == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  // TODO add textureReference::borderColor.
+  assert(false && "textureReference::borderColor is missing in header");
+  // std::memcpy(pBorderColor, texRef.borderColor, sizeof(texRef.borderColor));
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefGetFilterMode(hipTextureFilterMode* pfm,
+                                  const textureReference* texRef) {
+  // TODO overload operator<<(ostream&, textureReference&).
+  HIP_INIT_API(hipTexRefGetFilterMode, pfm, texRef);
+
+  if ((pfm == nullptr) || (texRef == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pfm = texRef->filterMode;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefGetFlags(unsigned int* pFlags,
+                             const textureReference* texRef) {
+  // TODO overload operator<<(ostream&, textureReference&).
+  HIP_INIT_API(hipTexRefGetFlags, pFlags, texRef);
+
+  if ((pFlags == nullptr) || (texRef == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pFlags = 0;
+
+  if (texRef->readMode == hipReadModeElementType) {
+    *pFlags |= HIP_TRSF_READ_AS_INTEGER;
+  }
+
+  if (texRef->normalized == 1) {
+    *pFlags |= HIP_TRSF_NORMALIZED_COORDINATES;
+  }
+
+  if (texRef->sRGB == 1) {
+    *pFlags |= HIP_TRSF_SRGB;
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefGetFormat(hipArray_Format* pFormat,
+                              int* pNumChannels,
+                              const textureReference* texRef) {
+  // TODO overload operator<<(ostream&, textureReference&).
+  HIP_INIT_API(hipTexRefGetFormat, pFormat, pNumChannels, texRef);
+
+  if ((pFormat == nullptr) || (pNumChannels == nullptr) ||
+      (texRef == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pFormat = texRef->format;
+  *pNumChannels = texRef->numChannels;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefGetMaxAnisotropy(int* pmaxAnsio,
+                                     const textureReference* texRef) {
+  // TODO overload operator<<(ostream&, textureReference&).
+  HIP_INIT_API(hipTexRefGetMaxAnisotropy, pmaxAnsio, texRef);
+
+  if ((pmaxAnsio == nullptr) || (texRef == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pmaxAnsio = texRef->maxAnisotropy;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefGetMipmapFilterMode(hipTextureFilterMode* pfm,
+                                        const textureReference* texRef) {
+  // TODO overload operator<<(ostream&, textureReference&).
+  HIP_INIT_API(hipTexRefGetMipmapFilterMode, pfm, texRef);
+
+  if ((pfm == nullptr) || (texRef == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pfm = texRef->mipmapFilterMode;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefGetMipmapLevelBias(float* pbias,
+                                       const textureReference* texRef) {
+  // TODO overload operator<<(ostream&, textureReference&).
+  HIP_INIT_API(hipTexRefGetMipmapLevelBias, pbias, texRef);
+
+  if ((pbias == nullptr) || (texRef == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pbias = texRef->mipmapLevelBias;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp,
+                                        float* pmaxMipmapLevelClamp,
+                                        const textureReference* texRef) {
+  // TODO overload operator<<(ostream&, textureReference&).
+  HIP_INIT_API(hipTexRefGetMipmapLevelClamp, pminMipmapLevelClamp, pmaxMipmapLevelClamp, texRef);
+
+  if ((pminMipmapLevelClamp == nullptr) || (pmaxMipmapLevelClamp == nullptr) ||
+      (texRef == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pminMipmapLevelClamp = texRef->minMipmapLevelClamp;
+  *pmaxMipmapLevelClamp = texRef->maxMipmapLevelClamp;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefGetMipmappedArray(hipMipmappedArray_t* pArray,
+                                      const textureReference* texRef) {
+  // TODO overload operator<<(ostream&, textureReference&).
+  HIP_INIT_API(hipTexRefGetMipmappedArray, pArray, texRef);
+
+  if ((pArray == nullptr) || (texRef == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipResourceDesc resDesc = {};
+  // TODO use ihipGetTextureObjectResourceDesc() to not pollute the API trace.
+  hipError_t error = hipGetTextureObjectResourceDesc(&resDesc, texRef->textureObject);
+  if (error != hipSuccess) {
+    HIP_RETURN(error);
+  }
+
+  switch (resDesc.resType) {
+    case hipResourceTypeLinear:
+    case hipResourceTypePitch2D:
+    case hipResourceTypeArray: {
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+    case hipResourceTypeMipmappedArray:
+      *pArray = resDesc.res.mipmap.mipmap;
+      break;
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefSetBorderColor(textureReference* texRef,
+                                   float* pBorderColor) {
+  HIP_INIT_API(hipTexRefSetBorderColor, texRef, pBorderColor);
+
+  if ((texRef == nullptr) || (pBorderColor == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  // TODO add textureReference::borderColor.
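+  // Until the header gains a borderColor member, this entry point is a stub
+  // that asserts in debug builds and stores nothing in release builds.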
+  assert(false && "textureReference::borderColor is missing in header");
+  // std::memcpy(texRef.borderColor, pBorderColor, sizeof(texRef.borderColor));
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefSetMaxAnisotropy(textureReference* texRef,
+                                     unsigned int maxAniso) {
+  HIP_INIT_API(hipTexRefSetMaxAnisotropy, texRef, maxAniso);
+
+  if (texRef == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  texRef->maxAnisotropy = maxAniso;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefSetMipmapFilterMode(textureReference* texRef,
+                                        hipTextureFilterMode fm) {
+  HIP_INIT_API(hipTexRefSetMipmapFilterMode, texRef, fm);
+
+  if (texRef == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  texRef->mipmapFilterMode = fm;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefSetMipmapLevelBias(textureReference* texRef,
+                                       float bias) {
+  HIP_INIT_API(hipTexRefSetMipmapLevelBias, texRef, bias);
+
+  if (texRef == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  texRef->mipmapLevelBias = bias;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefSetMipmapLevelClamp(textureReference* texRef,
+                                        float minMipMapLevelClamp,
+                                        float maxMipMapLevelClamp) {
+  HIP_INIT_API(hipTexRefSetMipmapLevelClamp, texRef, minMipMapLevelClamp, maxMipMapLevelClamp);
+
+  if (texRef == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  texRef->minMipmapLevelClamp = minMipMapLevelClamp;
+  texRef->maxMipmapLevelClamp = maxMipMapLevelClamp;
+
+  HIP_RETURN(hipSuccess);
+}
+
+hipError_t hipTexRefSetMipmappedArray(textureReference* texRef,
+                                      hipMipmappedArray* mipmappedArray,
+                                      unsigned int Flags) {
+  HIP_INIT_API(hipTexRefSetMipmappedArray, texRef, mipmappedArray, Flags);
+
+  if ((texRef == nullptr) || (mipmappedArray == nullptr)) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  if (Flags != HIP_TRSA_OVERRIDE_FORMAT) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipDeviceptr_t refDevPtr = nullptr;
+  size_t refDevSize = 0;
+  HIP_RETURN_ONFAIL(PlatformState::instance().getDynTexGlobalVar(texRef, &refDevPtr, &refDevSize));
+  assert(refDevSize == sizeof(textureReference));
+
+  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
+  // Any memory previously bound to texRef is unbound.
+  // No need to check for errors.
+  ihipDestroyTextureObject(texRef->textureObject);
+
+  hipResourceDesc resDesc = {};
+  resDesc.resType = hipResourceTypeMipmappedArray;
+  resDesc.res.mipmap.mipmap = mipmappedArray;
+
+  hipTextureDesc texDesc = hip::getTextureDesc(texRef);
+
+  hipResourceViewFormat format = hip::getResourceViewFormat(hip::getChannelFormatDesc(texRef->numChannels, texRef->format));
+  hipResourceViewDesc resViewDesc = hip::getResourceViewDesc(mipmappedArray, format);
+
+  hipError_t err = ihipCreateTextureObject(&texRef->textureObject, &resDesc, &texDesc, &resViewDesc);
+  if (err != hipSuccess) {
+    HIP_RETURN(err);
+  }
+  // Copy to device.
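+  // As with the other setters, push the rebuilt texture reference to its
+  // device-side shadow copy on the null stream.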
+ amd::HostQueue* queue = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); +} + +hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, + const HIP_RESOURCE_DESC* pResDesc, + const HIP_TEXTURE_DESC* pTexDesc, + const HIP_RESOURCE_VIEW_DESC* pResViewDesc) { + HIP_INIT_API(hipTexObjectCreate, pTexObject, pResDesc, pTexDesc, pResViewDesc); + + if ((pTexObject == nullptr) || (pResDesc == nullptr) || (pTexDesc == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + hipResourceDesc resDesc = hip::getResourceDesc(*pResDesc); + hipTextureDesc texDesc = hip::getTextureDesc(*pTexDesc); + + if (pResViewDesc != nullptr) { + hipResourceViewDesc resViewDesc = hip::getResourceViewDesc(*pResViewDesc); + HIP_RETURN(ihipCreateTextureObject(pTexObject, &resDesc, &texDesc, &resViewDesc)); + } else { + HIP_RETURN(ihipCreateTextureObject(pTexObject, &resDesc, &texDesc, nullptr)); + } +} + +hipError_t hipTexObjectDestroy(hipTextureObject_t texObject) { + HIP_INIT_API(hipTexObjectDestroy, texObject); + + HIP_RETURN(ihipDestroyTextureObject(texObject)); +} + +hipError_t hipTexObjectGetResourceDesc(HIP_RESOURCE_DESC* pResDesc, + hipTextureObject_t texObject) { + HIP_INIT_API(hipTexObjectGetResourceDesc, pResDesc, texObject); + + if ((pResDesc == nullptr) || (texObject == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + *pResDesc = hip::getResourceDesc(texObject->resDesc); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipTexObjectGetResourceViewDesc(HIP_RESOURCE_VIEW_DESC* pResViewDesc, + hipTextureObject_t texObject) { + HIP_INIT_API(hipTexObjectGetResourceViewDesc, pResViewDesc, texObject); + + if ((pResViewDesc == nullptr) || (texObject == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + *pResViewDesc = hip::getResourceViewDesc(texObject->resViewDesc); + + HIP_RETURN(hipSuccess); +} + +hipError_t hipTexObjectGetTextureDesc(HIP_TEXTURE_DESC* pTexDesc, + hipTextureObject_t texObject) { + HIP_INIT_API(hipTexObjectGetTextureDesc, pTexDesc, texObject); + + if ((pTexDesc == nullptr) || (texObject == nullptr)) { + HIP_RETURN(hipErrorInvalidValue); + } + + *pTexDesc = hip::getTextureDesc(texObject->texDesc); + + HIP_RETURN(hipSuccess); +} diff --git a/rocclr/hiprtc_internal.hpp b/rocclr/hiprtc_internal.hpp new file mode 100644 index 0000000000..4e533c9716 --- /dev/null +++ b/rocclr/hiprtc_internal.hpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2015-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef HIPRTC_SRC_HIP_INTERNAL_H
+#define HIPRTC_SRC_HIP_INTERNAL_H
+
+#include "hip_internal.hpp"
+
+#if __linux__
+#include <cstddef>
+
+#if HIPRTC_USE_CXXABI
+#include <cxxabi.h>
+
+#define DEMANGLE abi::__cxa_demangle
+
+#else
+extern "C" char * __cxa_demangle(const char *mangled_name, char *output_buffer,
+                                 size_t *length, int *status);
+
+#define DEMANGLE __cxa_demangle
+#endif //HIPRTC_USE_CXXABI
+
+#elif defined(_WIN32)
+#include <windows.h>
+#include <dbghelp.h>
+
+#define UNDECORATED_SIZE 4096
+
+#endif // __linux__
+
+// This macro should be called at the beginning of every HIP RTC API.
+#define HIPRTC_INIT_API(...)                                                  \
+  ClPrint(amd::LOG_INFO, amd::LOG_API, "[%zx] %s ( %s )", std::this_thread::get_id(), __func__, ToString( __VA_ARGS__ ).c_str()); \
+  amd::Thread* thread = amd::Thread::current();                               \
+  if (!VDI_CHECK_THREAD(thread)) {                                            \
+    HIPRTC_RETURN(HIPRTC_ERROR_INTERNAL_ERROR);                               \
+  }                                                                           \
+  HIP_INIT();
+
+#define HIPRTC_RETURN(ret)                                                    \
+  hiprtc::g_lastRtcError = ret;                                               \
+  ClPrint(amd::LOG_INFO, amd::LOG_API, "[%zx] %s: Returned %s", std::this_thread::get_id(), __func__, \
+          hiprtcGetErrorString(hiprtc::g_lastRtcError));                      \
+  return hiprtc::g_lastRtcError;
+
+
+#endif // HIPRTC_SRC_HIP_INTERNAL_H
diff --git a/rocclr/trace_helper.h b/rocclr/trace_helper.h
new file mode 100644
index 0000000000..ee4eae68ac
--- /dev/null
+++ b/rocclr/trace_helper.h
@@ -0,0 +1,246 @@
+/* Copyright (c) 2015-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+//---
+// Helper functions to convert HIP function arguments into strings.
+// Handles POD data types as well as enumerations (i.e. hipMemcpyKind).
+// The implementation uses C++11 variadic templates and template specialization.
+// The hipMemcpyKind example below is a good example that shows how to implement conversion for a
+// new HIP type.
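+//
+// For a hypothetical new enum (say, hipFooMode), a specialization would follow
+// the same CASE_STR pattern as the hipMemcpyKind one below:
+//
+//   template <>
+//   inline std::string ToString(hipFooMode v) {
+//     switch (v) {
+//       CASE_STR(hipFooModeDefault);
+//       default: return ToHexString(v);
+//     };
+//   };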
+
+
+// Handy macro to convert an enumeration to a stringified version of same:
+#define CASE_STR(x)                                                           \
+  case x:                                                                     \
+    return #x;
+
+inline const char* ihipErrorString(hipError_t hip_error) {
+  switch (hip_error) {
+    CASE_STR(hipSuccess);
+    CASE_STR(hipErrorOutOfMemory);
+    CASE_STR(hipErrorNotInitialized);
+    CASE_STR(hipErrorDeinitialized);
+    CASE_STR(hipErrorProfilerDisabled);
+    CASE_STR(hipErrorProfilerNotInitialized);
+    CASE_STR(hipErrorProfilerAlreadyStarted);
+    CASE_STR(hipErrorProfilerAlreadyStopped);
+    CASE_STR(hipErrorInvalidImage);
+    CASE_STR(hipErrorInvalidContext);
+    CASE_STR(hipErrorContextAlreadyCurrent);
+    CASE_STR(hipErrorMapFailed);
+    CASE_STR(hipErrorUnmapFailed);
+    CASE_STR(hipErrorArrayIsMapped);
+    CASE_STR(hipErrorAlreadyMapped);
+    CASE_STR(hipErrorNoBinaryForGpu);
+    CASE_STR(hipErrorAlreadyAcquired);
+    CASE_STR(hipErrorNotMapped);
+    CASE_STR(hipErrorNotMappedAsArray);
+    CASE_STR(hipErrorNotMappedAsPointer);
+    CASE_STR(hipErrorECCNotCorrectable);
+    CASE_STR(hipErrorUnsupportedLimit);
+    CASE_STR(hipErrorContextAlreadyInUse);
+    CASE_STR(hipErrorPeerAccessUnsupported);
+    CASE_STR(hipErrorInvalidKernelFile);
+    CASE_STR(hipErrorInvalidGraphicsContext);
+    CASE_STR(hipErrorInvalidSource);
+    CASE_STR(hipErrorFileNotFound);
+    CASE_STR(hipErrorSharedObjectSymbolNotFound);
+    CASE_STR(hipErrorSharedObjectInitFailed);
+    CASE_STR(hipErrorOperatingSystem);
+    CASE_STR(hipErrorSetOnActiveProcess);
+    CASE_STR(hipErrorInvalidHandle);
+    CASE_STR(hipErrorNotFound);
+    CASE_STR(hipErrorIllegalAddress);
+    CASE_STR(hipErrorMissingConfiguration);
+    CASE_STR(hipErrorLaunchFailure);
+    CASE_STR(hipErrorPriorLaunchFailure);
+    CASE_STR(hipErrorLaunchTimeOut);
+    CASE_STR(hipErrorLaunchOutOfResources);
+    CASE_STR(hipErrorInvalidDeviceFunction);
+    CASE_STR(hipErrorInvalidConfiguration);
+    CASE_STR(hipErrorInvalidDevice);
+    CASE_STR(hipErrorInvalidValue);
+    CASE_STR(hipErrorInvalidPitchValue);
+    CASE_STR(hipErrorInvalidDevicePointer);
+    CASE_STR(hipErrorInvalidMemcpyDirection);
+    CASE_STR(hipErrorUnknown);
+    CASE_STR(hipErrorNotReady);
+    CASE_STR(hipErrorNoDevice);
+    CASE_STR(hipErrorPeerAccessAlreadyEnabled);
+    CASE_STR(hipErrorPeerAccessNotEnabled);
+    CASE_STR(hipErrorRuntimeMemory);
+    CASE_STR(hipErrorRuntimeOther);
+    CASE_STR(hipErrorHostMemoryAlreadyRegistered);
+    CASE_STR(hipErrorHostMemoryNotRegistered);
+    CASE_STR(hipErrorTbd);
+    default:
+      return "hipErrorUnknown";
+  };
+};
+
+// Building block functions:
+template <typename T>
+inline std::string ToHexString(T v) {
+  std::ostringstream ss;
+  ss << "0x" << std::hex << v;
+  return ss.str();
+};
+
+template <typename T>
+inline std::string ToString(T* v) {
+  std::ostringstream ss;
+  if (v == NULL) {
+    ss << "char array:";
+  } else {
+    ss << v;
+  }
+  return ss.str();
+};
+
+template <typename T>
+inline std::string ToString(T** v) {
+  std::ostringstream ss;
+  if (v == NULL) {
+    ss << "char array:";
+  } else {
+    ss << v;
+  }
+  return ss.str();
+};
+
+//---
+// Template overloads for ToString to handle specific types
+
+// This is the default which works for most types:
+template <typename T>
+inline std::string ToString(T v) {
+  std::ostringstream ss;
+  ss << v;
+  return ss.str();
+};
+
+template <>
+inline std::string ToString(hipFunction_t v) {
+  std::ostringstream ss;
+  ss << "0x" << std::hex << static_cast<void*>(v);
+  return ss.str();
+};
+
+// hipEvent_t specialization. TODO - maybe add an event ID for debug?
+template <>
+inline std::string ToString(hipEvent_t v) {
+  std::ostringstream ss;
+  ss << "event:" << std::hex << static_cast<void*>(v);
+  return ss.str();
+};
+// hipStream_t
+template <>
+inline std::string ToString(hipStream_t v) {
+  std::ostringstream ss;
+  if (v == NULL) {
+    ss << "stream:";
+  } else {
+    ss << "stream:" << std::hex << static_cast<void*>(v);
+  }
+
+  return ss.str();
+};
+
+// hipCtx_t
+template <>
+inline std::string ToString(hipCtx_t v) {
+  std::ostringstream ss;
+  if (v == NULL) {
+    ss << "context:";
+  } else {
+    ss << "context:" << std::hex << static_cast<void*>(v);
+  }
+
+  return ss.str();
+};
+
+// hipPitchedPtr
+template <>
+inline std::string ToString(hipPitchedPtr v) {
+  std::ostringstream ss;
+  ss << "pitchPtr:" << std::hex << static_cast<void*>(v.ptr);
+  return ss.str();
+};
+
+// hipMemcpyKind specialization
+template <>
+inline std::string ToString(hipMemcpyKind v) {
+  switch (v) {
+    CASE_STR(hipMemcpyHostToHost);
+    CASE_STR(hipMemcpyHostToDevice);
+    CASE_STR(hipMemcpyDeviceToHost);
+    CASE_STR(hipMemcpyDeviceToDevice);
+    CASE_STR(hipMemcpyDefault);
+    default:
+      return ToHexString(v);
+  };
+};
+
+template <>
+inline std::string ToString(hipFuncCache_t v) {
+  switch (v) {
+    CASE_STR(hipFuncCachePreferNone);
+    CASE_STR(hipFuncCachePreferShared);
+    CASE_STR(hipFuncCachePreferL1);
+    CASE_STR(hipFuncCachePreferEqual);
+    default:
+      return ToHexString(v);
+  };
+};
+
+template <>
+inline std::string ToString(hipSharedMemConfig v) {
+  switch (v) {
+    CASE_STR(hipSharedMemBankSizeDefault);
+    CASE_STR(hipSharedMemBankSizeFourByte);
+    CASE_STR(hipSharedMemBankSizeEightByte);
+    default:
+      return ToHexString(v);
+  };
+};
+
+template <>
+inline std::string ToString(hipError_t v) {
+  return ihipErrorString(v);
+};
+
+// Catch empty arguments case
+inline std::string ToString() { return (""); }
+
+
+//---
+// C++11 variadic template - peels off first argument, converts to string, and calls itself again to
+// peel the next arg. Strings are automatically separated by comma+space.
+template <typename T, typename... Args>
+inline std::string ToString(T first, Args... args) {
+  return ToString(first) + ", " + ToString(args...);
+}
+
diff --git a/samples/2_Cookbook/15_static_library/host_functions/Makefile b/samples/2_Cookbook/15_static_library/host_functions/Makefile
index 4945075cad..2bbc26727a 100644
--- a/samples/2_Cookbook/15_static_library/host_functions/Makefile
+++ b/samples/2_Cookbook/15_static_library/host_functions/Makefile
@@ -25,7 +25,7 @@ $(HIPCC_EXE): $(EMIT_STATIC_LIB)
 # Compiles hipMain1 with g++ and links with libHipOptLibrary.a which contains host function.
 $(HOST_EXE): $(EMIT_STATIC_LIB)
-	$(GXX) $(EMIT_STATIC_MAIN_SRC) -L. -lHipOptLibrary -L$(HIP_PATH)/lib -lamdhip64 -Wl,-rpath=$(HIP_PATH)/lib -o $@
+	$(GXX) $(EMIT_STATIC_MAIN_SRC) -L. -lHipOptLibrary -L$(HIP_PATH)/lib -lamdhip64 -o $@
 
 test: $(HIPCC_EXE) $(HOST_EXE)
 	$(HIPCC_EXE)
diff --git a/tests/catch/CMakeLists.txt b/tests/catch/CMakeLists.txt
index d3b3f028e0..53122d59f0 100644
--- a/tests/catch/CMakeLists.txt
+++ b/tests/catch/CMakeLists.txt
@@ -44,33 +44,15 @@ include_directories(
   ${HIP_PATH}/include
   ${JSON_PARSER}
 )
-
-if(HIP_PLATFORM MATCHES "amd" AND HIP_COMPILER MATCHES "clang")
-  add_compile_options(-Wall -Wextra -pedantic -Werror)
-endif()
-
 cmake_policy(PUSH)
 if(POLICY CMP0037)
   cmake_policy(SET CMP0037 OLD)
 endif()
-
-# Use clang as host compiler with nvcc
-if(HIP_COMPILER MATCHES "nvcc")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ccbin clang")
-endif()
-
-# Disable CXX extensions (gnu++11 etc)
-set(CMAKE_CXX_EXTENSIONS OFF)
-
 add_custom_target(build_tests)
+add_custom_target(test COMMAND ${CMAKE_CTEST_COMMAND})
+add_dependencies(test build_tests)
 
-# Tests folder
 add_subdirectory(unit)
 add_subdirectory(hipTestMain)
 add_subdirectory(stress)
-
-if(UNIX AND HIP_PLATFORM MATCHES "amd")
-  add_subdirectory(multiproc)
-endif()
-
 cmake_policy(POP)
diff --git a/tests/catch/README.md b/tests/catch/README.md
index 9d57c8a59d..38d507c9c0 100644
--- a/tests/catch/README.md
+++ b/tests/catch/README.md
@@ -12,7 +12,7 @@ Tests in Catch2 are declared via ```TEST_CASE```.
 
 ## Taking care of existing features
 - Don’t build on platform: EXCLUDE_(HIP_PLATFORM/HIP_RUNTIME), can be done via CMAKE. Adding source in if(HIP_PLATFORM == amd/nvidia).
-- HIPCC_OPTIONS/CLANG Options: Can be done via: set_source_files_properties(src.cc PROPERTIES COMPILE_FLAGS “…”).
+- HIPCC_OPTIONS/CLANG Options: Can be done via: set_source_files_properties(src.cc PROPERTIES COMPILE_FLAGS “…”).
 - Additional libraries: Can be done via target_link_libraries()
 - Multiple runs with different args: This can be done by Catch’s Feature: GENERATE(…)
 Running Subtest: ctest -R “...” (Regex to match the subtest name)
@@ -31,6 +31,8 @@ Some useful functions are:
 - `bool isLinux()` : true if os is linux
 - `bool isAmd()` : true if platform is AMD
 - `bool isNvidia()` : true if platform is NVIDIA
+- `std::vector<std::string> getDevices()` : returns a vector of strings that contains device names (e.g. for AMD: gfx906, gfx908 etc.; for NVIDIA: RTX 2070 Super)
+- `std::vector<std::string> getTargetId()` : (AMD only) returns target ids for gpus (e.g. gfx906:sramecc+:xnack- etc.)
 
 This information can be accessed in any test via using: `TestContext::get().isAmd()`.
@@ -70,4 +72,5 @@ Catch2 allows multiple ways in which you can debug the test case.
 ## External Libs being used
 - [Catch2](https://github.com/catchorg/Catch2) - Testing framework
-- [picojson](https://github.com/kazuho/picojson) - For config file parsing
+- [taocpp/json](https://github.com/taocpp/json) - For config file parsing
+- [taocpp/PEGTL](https://github.com/taocpp/PEGTL) - Helper lib for taojson
diff --git a/tests/catch/hipTestMain/CMakeLists.txt b/tests/catch/hipTestMain/CMakeLists.txt
index 3ca07fb7c7..e0a7dfb0b6 100644
--- a/tests/catch/hipTestMain/CMakeLists.txt
+++ b/tests/catch/hipTestMain/CMakeLists.txt
@@ -1,33 +1,11 @@
 if(CMAKE_BUILD_TYPE MATCHES "^Debug$")
   add_definitions(-DHT_LOG_ENABLE)
 endif()
-
-add_executable(UnitTests EXCLUDE_FROM_ALL main.cc hip_test_context.cc)
-if(HIP_PLATFORM MATCHES "amd")
-  set_property(TARGET UnitTests PROPERTY CXX_STANDARD 17)
-else()
-  target_compile_options(UnitTests PUBLIC -std=c++17)
-endif()
-
-target_link_libraries(UnitTests PRIVATE DeviceLibs
-                                        MemoryTest
-                                        Kernels
-                                        stdc++fs)
-
-# Add AMD Only Tests
-if(HIP_PLATFORM MATCHES "amd")
-  # target_link_libraries(UnitTests PRIVATE RTC)
-endif()
-
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTAO_PEGTL_STD_EXPERIMENTAL_FILESYSTEM=1")
+add_library(ht_context SHARED EXCLUDE_FROM_ALL hip_test_context.cc)
+set_property(TARGET ht_context PROPERTY CXX_STANDARD 17)
+target_link_libraries(ht_context PRIVATE stdc++fs)
+add_executable(UnitTests EXCLUDE_FROM_ALL main.cc)
+target_link_libraries(UnitTests PRIVATE ht_context DeviceLibs MemoryTest Kernels stdc++fs)
 catch_discover_tests(UnitTests PROPERTIES SKIP_REGULAR_EXPRESSION "HIP_SKIP_THIS_TEST")
 add_dependencies(build_tests UnitTests)
-
-# Add Multiproc tests as separate binary
-if(UNIX AND HIP_PLATFORM MATCHES "amd")
-  add_executable(MultiProcTests EXCLUDE_FROM_ALL main.cc hip_test_context.cc)
-  set_property(TARGET MultiProcTests PROPERTY CXX_STANDARD 17)
-  target_link_libraries(MultiProcTests PRIVATE MultiProc
-                                               stdc++fs)
-  catch_discover_tests(MultiProcTests PROPERTIES SKIP_REGULAR_EXPRESSION "HIP_SKIP_THIS_TEST")
-  add_dependencies(build_tests MultiProcTests)
-endif()
diff --git a/tests/catch/hipTestMain/hip_test_context.cc b/tests/catch/hipTestMain/hip_test_context.cc
index 71c2a9bc0e..4fb6b70314 100644
--- a/tests/catch/hipTestMain/hip_test_context.cc
+++ b/tests/catch/hipTestMain/hip_test_context.cc
@@ -15,6 +15,26 @@ namespace fs = std::experimental::filesystem;
 #include <picojson.h>
 
+static std::string getValue(std::string option, const std::string& opt_str) {
+  std::string s_opt = opt_str;
+  return s_opt.erase(0, option.size());
+}
+
+static std::string trimName(std::string input, char trim) {
+  auto pos_ = input.find(trim);
+  auto res = input;
+  if (pos_ == std::string::npos) {
+    input = "";
+  } else {
+    res = input.substr(0, pos_);
+    input = input.substr(pos_);
+  }
+  return res;
+}
+
+const std::vector<std::string>& TestContext::getDevices() const { return config_.devices; }
+const std::vector<std::string>& TestContext::getTargetId() const { return config_.targetId; }
+
 void TestContext::detectOS() {
 #if (HT_WIN == 1)
   p_windows = true;
@@ -37,21 +57,18 @@ void TestContext::fillConfig() {
             (env_config != nullptr) ? env_config : "Not found, using default config");
 
   // Check if path has been provided
-  std::string def_config_json = "config.json";
   std::string config_str;
   if (env_config != nullptr) {
     config_str = env_config;
   } else {
-    config_str = def_config_json;
+    config_str = "config.json";
   }
 
   fs::path config_path = config_str;
-  if (config_path.has_parent_path() && config_path.has_filename()) {
+  if (config_path.has_parent_path()) {
     config_.json_file = config_str;
-  } else if (config_path.has_parent_path()) {
-    config_.json_file = config_path / def_config_json;
   } else {
-    config_.json_file = exe_path + def_config_json;
+    config_.json_file = exe_path + config_str;
   }
 
   LogPrintf("Config file path: %s", config_.json_file.c_str());
@@ -62,6 +79,37 @@ void TestContext::fillConfig() {
     LogPrintf("%s", "Either Config or Os is unknown, this wont end well");
     abort();
   }
+
+  int deviceCount = 0;
+  auto res = hipGetDeviceCount(&deviceCount);
+  if (res != hipSuccess) {
+    LogPrintf("HIP Device Count query failed with: %s", hipGetErrorString(res));
+    abort();
+  }
+  if (deviceCount == 0) {
+    LogPrintf("%s", "No hip devices found");
+    abort();
+  }
+  config_.devices.reserve(deviceCount);
+  for (int i = 0; i < deviceCount; i++) {
+    hipDeviceProp_t props;
+    res = hipGetDeviceProperties(&props, i);
+    if (res != hipSuccess) {
+      LogPrintf("HIP Device Properties query failed with: %s", hipGetErrorString(res));
+      abort();
+    }
+    if (amd) {
+      std::string tid = std::string(props.gcnArchName);
+      config_.targetId.push_back(tid);
+      std::string dev = trimName(tid, ':');
+      config_.devices.push_back(dev);
+    } else if (nvidia) {
+      config_.devices.push_back(std::string(props.name));
+    } else {
+      LogPrintf("%s", "Unknown platform");
+      abort();
+    }
+  }
 }
 
 TestContext::TestContext(int argc, char** argv) {
@@ -74,7 +122,6 @@ TestContext::TestContext(int argc, char** argv) {
 }
 
 void TestContext::setExePath(int argc, char** argv) {
-  if (argc == 0) return;
   fs::path p = std::string(argv[0]);
   if (p.has_filename()) p.remove_filename();
   exe_path = p.string();
@@ -120,7 +167,9 @@ bool TestContext::parseJsonFile() {
   LogPrintf("Json contents:: %s", json_str.data());
 
   picojson::value v;
-  std::string err = picojson::parse(v, json_str);
+  std::string err;
+  const char* json_end =
+      picojson::parse(v, json_str.data(), json_str.data() + json_str.size(), &err);
   if (err.size() > 1) {
     LogPrintf("Error from PicoJson: %s", err.data());
     return false;
@@ -130,7 +179,6 @@ bool TestContext::parseJsonFile() {
     LogPrintf("%s", "Data in json is not in correct format, it should be an object");
     return false;
   }
-
   const picojson::object &o = v.get<picojson::object>();
   for (picojson::object::const_iterator i = o.begin(); i != o.end(); ++i) {
     // Processing for DisabledTests
diff --git a/tests/catch/hipTestMain/main.cc b/tests/catch/hipTestMain/main.cc
index 886aa7a8dc..c70bf755cf 100644
--- a/tests/catch/hipTestMain/main.cc
+++ b/tests/catch/hipTestMain/main.cc
@@ -6,7 +6,7 @@ int main(int argc, char** argv) {
   auto& context = TestContext::get(argc, argv);
   if (context.skipTest()) {
     // CTest uses this regex to figure out if the test has been skipped
-    std::cout << "HIP_SKIP_THIS_TEST" << std::endl;
+    std::cout << "HIP_SKIP_THIS_TEST" << context.getCurrentTest() << std::endl;
     return 0;
   }
   return Catch::Session().run(argc, argv);
diff --git a/tests/catch/include/hip_test_checkers.hh b/tests/catch/include/hip_test_checkers.hh
deleted file mode 100644
index d0c180c25c..0000000000
--- a/tests/catch/include/hip_test_checkers.hh
+++ /dev/null
@@ -1,164 +0,0 @@
-#pragma once
-#include "hip_test_common.hh"
-
-namespace HipTest {
-template <typename T>
-size_t checkVectors(T* A, T* B, T* Out, size_t N, T (*F)(T a, T b), bool expectMatch = true,
-                    bool reportMismatch = true) {
-  size_t mismatchCount = 0;
-  size_t firstMismatch = 0;
-  size_t mismatchesToPrint = 10;
-  for (size_t i = 0; i < N; i++) {
-    T expected = F(A[i], B[i]);
-    if (Out[i] != expected) {
-      if (mismatchCount == 0) {
-        firstMismatch = i;
-      }
-      mismatchCount++;
-      if ((mismatchCount <= mismatchesToPrint) && expectMatch) {
-        INFO("Mismatch at " << i << " Computed: " << Out[i] << " Expected: " << expected);
-        CHECK(false);
-      }
-    }
-  }
-
-  if (reportMismatch) {
-    if (expectMatch) {
-      if (mismatchCount) {
-        INFO(mismatchCount << " Mismatches First Mismatch at index : " << firstMismatch);
-        REQUIRE(false);
-      }
-    } else {
-      if (mismatchCount == 0) {
-        INFO("Expected Mismatch but not found any");
-        REQUIRE(false);
-      }
-    }
-  }
-
-  return mismatchCount;
-}
-
-template <typename T>
-size_t checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch = true,
-                      bool reportMismatch = true) {
-  return checkVectors<T>(
-      A_h, B_h, result_H, N, [](T a, T b) { return a + b; }, expectMatch, reportMismatch);
-}
-
-template <typename T>
-void checkTest(T* expected_H, T* result_H, size_t N, bool expectMatch = true) {
-  checkVectors<T>(
-      expected_H, expected_H, result_H, N,
-      [](T a, T b) {
-        guarantee(a == b, "Both values should be equal");
-        return a;
-      },
-      expectMatch);
-}
-
-
-// Setters and Memory Management
-
-template <typename T> void setDefaultData(size_t numElements, T* A_h, T* B_h, T* C_h) {
-  // Initialize the host data:
-  for (size_t i = 0; i < numElements; i++) {
-    if (A_h) (A_h)[i] = 3.146f + i;  // Pi
-    if (B_h) (B_h)[i] = 1.618f + i;  // Phi
-    if (C_h) (C_h)[i] = 0.0f + i;
-  }
-}
-
-template <typename T>
-bool initArraysForHost(T** A_h, T** B_h, T** C_h, size_t N, bool usePinnedHost = false) {
-  size_t Nbytes = N * sizeof(T);
-
-  if (usePinnedHost) {
-    if (A_h) {
-      HIPCHECK(hipHostMalloc((void**)A_h, Nbytes));
-    }
-    if (B_h) {
-      HIPCHECK(hipHostMalloc((void**)B_h, Nbytes));
-    }
-    if (C_h) {
-      HIPCHECK(hipHostMalloc((void**)C_h, Nbytes));
-    }
-  } else {
-    if (A_h) {
-      *A_h = (T*)malloc(Nbytes);
-      REQUIRE(*A_h != NULL);
-    }
-
-    if (B_h) {
-      *B_h = (T*)malloc(Nbytes);
-      REQUIRE(*B_h != NULL);
-    }
-
-    if (C_h) {
-      *C_h = (T*)malloc(Nbytes);
-      REQUIRE(*C_h != NULL);
-    }
-  }
-
-  setDefaultData(N, A_h ? *A_h : NULL, B_h ? *B_h : NULL, C_h ? *C_h : NULL);
-  return true;
-}
-
-template <typename T>
-bool initArrays(T** A_d, T** B_d, T** C_d, T** A_h, T** B_h, T** C_h, size_t N,
-                bool usePinnedHost = false) {
-  size_t Nbytes = N * sizeof(T);
-
-  if (A_d) {
-    HIPCHECK(hipMalloc(A_d, Nbytes));
-  }
-  if (B_d) {
-    HIPCHECK(hipMalloc(B_d, Nbytes));
-  }
-  if (C_d) {
-    HIPCHECK(hipMalloc(C_d, Nbytes));
-  }
-
-  return initArraysForHost(A_h, B_h, C_h, N, usePinnedHost);
-}
-
-template <typename T> bool freeArraysForHost(T* A_h, T* B_h, T* C_h, bool usePinnedHost) {
-  if (usePinnedHost) {
-    if (A_h) {
-      HIPCHECK(hipHostFree(A_h));
-    }
-    if (B_h) {
-      HIPCHECK(hipHostFree(B_h));
-    }
-    if (C_h) {
-      HIPCHECK(hipHostFree(C_h));
-    }
-  } else {
-    if (A_h) {
-      free(A_h);
-    }
-    if (B_h) {
-      free(B_h);
-    }
-    if (C_h) {
-      free(C_h);
-    }
-  }
-  return true;
-}
-
-template <typename T>
-bool freeArrays(T* A_d, T* B_d, T* C_d, T* A_h, T* B_h, T* C_h, bool usePinnedHost) {
-  if (A_d) {
-    HIPCHECK(hipFree(A_d));
-  }
-  if (B_d) {
-    HIPCHECK(hipFree(B_d));
-  }
-  if (C_d) {
-    HIPCHECK(hipFree(C_d));
-  }
-
-  return freeArraysForHost(A_h, B_h, C_h, usePinnedHost);
-}
-}  // namespace HipTest
diff --git a/tests/catch/include/hip_test_common.hh b/tests/catch/include/hip_test_common.hh
index a6e07973f9..b654445486 100644
--- a/tests/catch/include/hip_test_common.hh
+++ b/tests/catch/include/hip_test_common.hh
@@ -1,16 +1,2 @@
-#pragma once
 #include "hip_test_context.hh"
 #include <catch.hpp>
-
-#define HIP_PRINT_STATUS(status) INFO(hipGetErrorName(status) << " at line: " << __LINE__);
-
-#define HIPCHECK(error)                                                                     \
-  {                                                                                         \
-    hipError_t localError = error;                                                          \
-    if ((localError != hipSuccess) && (localError != hipErrorPeerAccessAlreadyEnabled)) {   \
-      INFO("Error: " << hipGetErrorString(localError) << " Code: " << localError << " Str: " \
-                     << #error << " In File: " << __FILE__ << " At line: " << __LINE__);    \
-      REQUIRE(false);                                                                       \
-    }                                                                                       \
-  }
-
diff --git a/tests/catch/include/hip_test_context.hh b/tests/catch/include/hip_test_context.hh
index cd81024aa6..6b0100fe12 100644
--- a/tests/catch/include/hip_test_context.hh
+++ b/tests/catch/include/hip_test_context.hh
@@ -34,9 +34,12 @@ static int _log_enable = (std::getenv("HT_LOG_ENABLE") ? 1 : 0);
   }                                                                           \
 }
 
+
 typedef struct Config_ {
   std::string json_file;  // Json file
   std::string platform;   // amd/nvidia
+  std::vector<std::string> devices;   // gfx906, etc
+  std::vector<std::string> targetId;  // Target Ids, only for AMD, gfx906:sramecc+:xnack-
   std::string os;         // windows/linux
 } Config;
 
@@ -70,6 +73,8 @@ class TestContext {
   bool isNvidia() const;
   bool isAmd() const;
   bool skipTest() const;
+  const std::vector<std::string>& getDevices() const;
+  const std::vector<std::string>& getTargetId() const;
   const std::string& getCurrentTest() const { return current_test; }
   std::string currentPath();
diff --git a/tests/catch/include/hip_test_kernels.hh b/tests/catch/include/hip_test_kernels.hh
deleted file mode 100644
index 7196accd97..0000000000
--- a/tests/catch/include/hip_test_kernels.hh
+++ /dev/null
@@ -1,62 +0,0 @@
-#pragma once
-
-#include <hip/hip_runtime.h>
-
-namespace HipTest {
-template <typename T> __global__ void vectorADD(const T* A_d, const T* B_d, T* C_d, size_t NELEM) {
-  size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-  size_t stride = blockDim.x * gridDim.x;
-
-  for (size_t i = offset; i < NELEM; i += stride) {
-    C_d[i] = A_d[i] + B_d[i];
-  }
-}
-
-
-template <typename T>
-__global__ void vectorADDReverse(const T* A_d, const T* B_d, T* C_d, size_t NELEM) {
-  size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-  size_t stride = blockDim.x * gridDim.x;
-
-  for (int64_t i = NELEM - stride + offset; i >= 0; i -= stride) {
-    C_d[i] = A_d[i] + B_d[i];
-  }
-}
-
-
-template <typename T> __global__ void addCount(const T* A_d, T* C_d, size_t NELEM, int count) {
-  size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-  size_t stride = blockDim.x * gridDim.x;
-
-  // Deliberately do this in an inefficient way to increase kernel runtime
-  for (int i = 0; i < count; i++) {
-    for (size_t i = offset; i < NELEM; i += stride) {
-      C_d[i] = A_d[i] + (T)count;
-    }
-  }
-}
-
-
-template <typename T>
-__global__ void addCountReverse(const T* A_d, T* C_d, int64_t NELEM, int count) {
-  size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-  size_t stride = blockDim.x * gridDim.x;
-
-  // Deliberately do this in an inefficient way to increase kernel runtime
-  for (int i = 0; i < count; i++) {
-    for (int64_t i = NELEM - stride + offset; i >= 0; i -= stride) {
-      C_d[i] = A_d[i] + (T)count;
-    }
-  }
-}
-
-
-template <typename T> __global__ void memsetReverse(T* C_d, T val, int64_t NELEM) {
-  size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-  size_t stride = blockDim.x * gridDim.x;
-
-  for (int64_t i = NELEM - stride + offset; i >= 0; i -= stride) {
-    C_d[i] = val;
-  }
-}
-}  // namespace HipTest
\ No newline at end of file
diff --git a/tests/catch/multiproc/CMakeLists.txt b/tests/catch/multiproc/CMakeLists.txt
deleted file mode 100644
index a782262380..0000000000
--- a/tests/catch/multiproc/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-# AMD Tests
-set(LINUX_TEST_SRC
-  hipMallocConcurrency.cc
-  childMalloc.cc
-)
-
-if(UNIX)
-  # Create shared lib of all tests
-  add_library(MultiProc SHARED EXCLUDE_FROM_ALL ${LINUX_TEST_SRC})
-
-  # Add dependency on build_tests to build it on this custom target
-  add_dependencies(build_tests MultiProc)
-endif()
diff --git a/tests/catch/multiproc/childMalloc.cc b/tests/catch/multiproc/childMalloc.cc
deleted file mode 100644
index 858fd0878e..0000000000
--- a/tests/catch/multiproc/childMalloc.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <hip_test_common.hh>
-#include <cstdio>
-#include <cstdlib>
-
-#ifdef __linux__
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include <cstring>
-#endif
-
-
-bool testMallocFromChild() {
-  int fd[2];
-  pid_t childpid;
-  bool testResult = false;
-
-  // create pipe descriptors
-  pipe(fd);
-
-  childpid = fork();
-  if (childpid > 0) {  // Parent
-    close(fd[1]);
-    // parent will wait to read the device cnt
-    read(fd[0], &testResult, sizeof(testResult));
-
-    // close the read-descriptor
-    close(fd[0]);
-
-    // wait for child exit
-    wait(NULL);
-
-    return testResult;
-
-  } else if (!childpid) {  // Child
-    // writing only, no need for read-descriptor
-    close(fd[0]);
-
-    char* A_d = nullptr;
-    hipError_t ret = hipMalloc(&A_d, 1024);
-
-    printf("hipMalloc returned : %s\n", hipGetErrorString(ret));
-    if (ret == hipSuccess)
-      testResult = true;
-    else
-      testResult = false;
-
-    // send the value on the write-descriptor:
-    write(fd[1], &testResult, sizeof(testResult));
-
-    // close the write descriptor:
-    close(fd[1]);
-    exit(0);
-  }
-  return false;
-}
-
-
-TEST_CASE("ChildMalloc") {
-  auto res = testMallocFromChild();
-  REQUIRE(res == true);
-}
diff --git a/tests/catch/multiproc/hipMallocConcurrency.cc b/tests/catch/multiproc/hipMallocConcurrency.cc
deleted file mode 100644
index 72d17c26a0..0000000000
--- a/tests/catch/multiproc/hipMallocConcurrency.cc
+++ /dev/null
@@ -1,188 +0,0 @@
-#include <hip_test_common.hh>
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#ifdef __linux__
-#include <unistd.h>
-#include <sys/wait.h>
-#endif
-#include
-#include
-#include
-#include
-
-
-#include
-
-size_t N = 4 * 1024 * 1024;
-unsigned blocksPerCU = 6;  // to hide latency
-unsigned threadsPerBlock = 256;
-
-
-unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N) {
-  int device;
-  HIPCHECK(hipGetDevice(&device));
-  hipDeviceProp_t props;
-  HIPCHECK(hipGetDeviceProperties(&props, device));
-
-  unsigned blocks = props.multiProcessorCount * blocksPerCU;
-  if (blocks * threadsPerBlock > N) {
-    blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
-  }
-
-  return blocks;
-}
-
-
-/**
- * Validates data consistency on supplied gpu
- */
-bool validateMemoryOnGPU(int gpu, bool concurOnOneGPU = false) {
-  size_t Nbytes = N * sizeof(int);
-  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-  size_t prevAvl, prevTot, curAvl, curTot;
-  bool TestPassed = true;
-
-  HIPCHECK(hipSetDevice(gpu));
-  HIPCHECK(hipMemGetInfo(&prevAvl, &prevTot));
-  printf("tgs allocating..\n");
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
-
-  unsigned blocks = setNumBlocks(blocksPerCU, threadsPerBlock, N);
-
-  HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-  HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-
-  hipLaunchKernelGGL(HipTest::vectorADD<int>, dim3(blocks), dim3(threadsPerBlock), 0, 0,
-                     static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
-
-  HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-
-  if (!HipTest::checkVectorADD(A_h, B_h, C_h, N)) {
-    printf("Validation PASSED for gpu %d from pid %d\n", gpu, getpid());
-  } else {
-    printf("%s : Validation FAILED for gpu %d from pid %d\n", __func__, gpu, getpid());
-    TestPassed &= false;
-  }
-
-  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
-  HIPCHECK(hipMemGetInfo(&curAvl, &curTot));
-
-  if (!concurOnOneGPU && (prevAvl != curAvl || prevTot != curTot)) {
-    // In concurrent calls on one GPU, we cannot verify leaking in this way
-    printf(
-        "%s : Memory allocation mismatch observed."
- "Possible memory leak.\n", - __func__); - TestPassed &= false; - } - - return TestPassed; -} - - -#if 1 -/** - * Fetches Gpu device count - */ -void getDeviceCount1(int* pdevCnt) { -#ifdef __linux__ - int fd[2], val = 0; - pid_t childpid; - - // create pipe descriptors - pipe(fd); - - // disable visible_devices env from shell - unsetenv("ROCR_VISIBLE_DEVICES"); - unsetenv("HIP_VISIBLE_DEVICES"); - - childpid = fork(); - - if (childpid > 0) { // Parent - close(fd[1]); - // parent will wait to read the device cnt - read(fd[0], &val, sizeof(val)); - - // close the read-descriptor - close(fd[0]); - - // wait for child exit - wait(NULL); - - *pdevCnt = val; - } else if (!childpid) { // Child - int devCnt = 1; - // writing only, no need for read-descriptor - close(fd[0]); - - HIPCHECK(hipGetDeviceCount(&devCnt)); - // send the value on the write-descriptor: - write(fd[1], &devCnt, sizeof(devCnt)); - - // close the write descriptor: - close(fd[1]); - exit(0); - } else { // failure - *pdevCnt = 1; - return; - } - -#else - HIPCHECK(hipGetDeviceCount(pdevCnt)); -#endif -} -#endif - - -TEST_CASE("hipMallocChild_Concurrency_MultiGpu") { - bool TestPassed = false; -#ifdef __linux__ - // Parallel execution on multiple gpus from different child processes - int devCnt = 1, pid = 0; - - // Get GPU count - getDeviceCount1(&devCnt); - - // Spawn child for each GPU - for (int gpu = 0; gpu < devCnt; gpu++) { - if ((pid = fork()) < 0) { - INFO("Child_Concurrency_MultiGpu : fork() returned error" << pid); - REQUIRE(false); - - } else if (!pid) { // Child process - bool TestPassedChild = false; - TestPassedChild = validateMemoryOnGPU(gpu); - - if (TestPassedChild) { - printf("returning exit(1) for success\n"); - exit(1); // child exit with success status - } else { - printf("Child_Concurrency_MultiGpu : childpid %d failed\n", getpid()); - exit(2); // child exit with failure status - } - } - } - - // Parent shall wait for child to complete - int cnt = 0; - - for (int i = 0; i < devCnt; i++) { - int pidwait = 0, exitStatus; - pidwait = wait(&exitStatus); - - printf("exitStatus for iter:%d is %d\n", i, exitStatus); - if (pidwait < 0) { - break; - } - - if (WEXITSTATUS(exitStatus) == 1) cnt++; - } - - if (cnt && (cnt == devCnt)) TestPassed = true; - -#else - INFO("Test hipMallocChild_Concurrency_MultiGpu skipped on non-linux"); -#endif - REQUIRE(TestPassed == true); -} diff --git a/tests/catch/unit/CMakeLists.txt b/tests/catch/unit/CMakeLists.txt index cff55f37a8..d913d25a5c 100644 --- a/tests/catch/unit/CMakeLists.txt +++ b/tests/catch/unit/CMakeLists.txt @@ -1,5 +1,4 @@ add_subdirectory(memory) add_subdirectory(deviceLib) add_subdirectory(kernels) -# Disable Saxpy test temporarily to see if CI Passes -# add_subdirectory(rtc) +add_subdirectory(rtc) diff --git a/tests/catch/unit/deviceLib/CMakeLists.txt b/tests/catch/unit/deviceLib/CMakeLists.txt index 22de79c687..421261b518 100644 --- a/tests/catch/unit/deviceLib/CMakeLists.txt +++ b/tests/catch/unit/deviceLib/CMakeLists.txt @@ -1,18 +1,9 @@ # Common Tests - Test independent of all platforms set(TEST_SRC floatMath.cc -) - -# AMD only tests -set(AMD_TEST_SRC vectorTypesDevice.cc ) -if(HIP_PLATFORM MATCHES "amd") - set(TEST_SRC ${TEST_SRC} ${AMD_TEST_SRC}) -endif() - - # Create shared lib of all tests add_library(DeviceLibs SHARED EXCLUDE_FROM_ALL ${TEST_SRC}) diff --git a/tests/catch/unit/kernels/add.cc b/tests/catch/unit/kernels/add.cc index 4f70ffef77..7adfde51d2 100644 --- a/tests/catch/unit/kernels/add.cc +++ b/tests/catch/unit/kernels/add.cc @@ -2,11 +2,12 @@ 
 #include <vector>
 
 template <typename T> __global__ void add(T* a, T* b, T* c, size_t size) {
-  size_t i = threadIdx.x;
-  if (i < size) c[i] = a[i] + b[i];
+  int i = threadIdx.x;
+  c[i] = a[i] + b[i];
 }
 
 TEMPLATE_TEST_CASE("Add Kernel", "[kernel][add]", int, long, float, long long, double) {
+  auto addKernel = add<TestType>;
   auto size = GENERATE(as<size_t>{}, 100, 500, 1000);
   TestType *d_a, *d_b, *d_c;
   auto res = hipMalloc(&d_a, sizeof(TestType) * size);
@@ -17,7 +18,7 @@ TEMPLATE_TEST_CASE("Add Kernel", "[kernel][add]", int, long, float, long long, d
   REQUIRE(res == hipSuccess);
 
   std::vector<TestType> a, b, c;
-  for (size_t i = 0; i < size; i++) {
+  for (int i = 0; i < size; i++) {
     a.push_back(i + 1);
     b.push_back(i + 1);
     c.push_back(2 * (i + 1));
@@ -28,7 +29,7 @@ TEMPLATE_TEST_CASE("Add Kernel", "[kernel][add]", int, long, float, long long, d
   res = hipMemcpy(d_b, b.data(), sizeof(TestType) * size, hipMemcpyHostToDevice);
   REQUIRE(res == hipSuccess);
 
-  hipLaunchKernelGGL(add<TestType>, 1, size, 0, 0, d_a, d_b, d_c, size);
+  hipLaunchKernelGGL(addKernel, 1, size, 0, 0, d_a, d_b, d_c, size);
 
   res = hipMemcpy(a.data(), d_c, sizeof(TestType) * size, hipMemcpyDeviceToHost);
   REQUIRE(res == hipSuccess);
diff --git a/tests/catch/unit/rtc/CMakeLists.txt b/tests/catch/unit/rtc/CMakeLists.txt
index 062e4153c6..435d372fbc 100644
--- a/tests/catch/unit/rtc/CMakeLists.txt
+++ b/tests/catch/unit/rtc/CMakeLists.txt
@@ -1,12 +1,14 @@
-# AMD Tests
-set(AMD_TEST_SRC
+# Common Tests - Test independent of all platforms
+set(TEST_SRC
   saxpy.cc
 )
 
-if(HIP_PLATFORM MATCHES "amd")
-  # Create shared lib of all tests
-  add_library(RTC SHARED EXCLUDE_FROM_ALL ${AMD_TEST_SRC})
+# Set source File properties
+set_source_files_properties(saxpy.cc PROPERTIES COMPILE_FLAGS " -std=c++14 ")
+set_source_files_properties(test.cc PROPERTIES COMPILE_FLAGS " -std=c++17 ")
 
-  # Add dependency on build_tests to build it on this custom target
-  add_dependencies(build_tests RTC)
-endif()
+# Create shared lib of all tests
+add_library(RTC SHARED EXCLUDE_FROM_ALL ${TEST_SRC})
+
+# Add dependency on build_tests to build it on this custom target
+add_dependencies(build_tests RTC)
diff --git a/tests/catch/unit/rtc/saxpy.cc b/tests/catch/unit/rtc/saxpy.cc
index af7ca24a2f..186349ae00 100644
--- a/tests/catch/unit/rtc/saxpy.cc
+++ b/tests/catch/unit/rtc/saxpy.cc
@@ -15,7 +15,7 @@ static constexpr auto NUM_THREADS{128};
 static constexpr auto NUM_BLOCKS{32};
 
 static constexpr auto saxpy{
-R"(
+    R"(
 #include <hip/hip_runtime.h>
 extern "C"
 __global__
@@ -23,7 +23,8 @@ void saxpy(float a, float* x, float* y, float* out, size_t n)
 {
     size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
     if (tid < n) {
-        out[tid] = a * x[tid] + y[tid];
+        out[tid] = a * x[tid] + y[tid] ;
+    }
 }
 
@@ -71,42 +72,42 @@ TEST_CASE("saxpy", "[hiprtc][saxpy]") {
   unique_ptr<float[]> hX{new float[n]};
   unique_ptr<float[]> hY{new float[n]};
   unique_ptr<float[]> hOut{new float[n]};
-  for (size_t i = 0; i < n; ++i) {
-    hX[i] = static_cast<float>(i);
-    hY[i] = static_cast<float>(i * 2);
-  }
-
-  hipDeviceptr_t dX, dY, dOut;
-  hipMalloc(&dX, bufferSize);
-  hipMalloc(&dY, bufferSize);
-  hipMalloc(&dOut, bufferSize);
-  hipMemcpyHtoD(dX, hX.get(), bufferSize);
-  hipMemcpyHtoD(dY, hY.get(), bufferSize);
-
-  struct {
-    float a_;
-    hipDeviceptr_t b_;
-    hipDeviceptr_t c_;
-    hipDeviceptr_t d_;
-    size_t e_;
-  } args{a, dX, dY, dOut, n};
-
-  auto size = sizeof(args);
-  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
-                    HIP_LAUNCH_PARAM_END};
-
-  hipModuleLaunchKernel(kernel, NUM_BLOCKS, 1, 1, NUM_THREADS, 1, 1, 0, nullptr, nullptr, config);
-  hipMemcpyDtoH(hOut.get(), dOut, bufferSize);
-
-  for (size_t i = 0; i < n; ++i) {
-    INFO("For " << i << " Value: " << fabs(a * hX[i] + hY[i] - hOut[i])
-                << " with: " << (fabs(hOut[i] * 1.0f) * 1e-6));
-    REQUIRE(fabs(a * hX[i] + hY[i] - hOut[i]) <= fabs(hOut[i]) * 1e-6);
-  }
-
-  hipFree(dX);
-  hipFree(dY);
-  hipFree(dOut);
-
-  hipModuleUnload(module);
+for (size_t i = 0; i < n; ++i) {
+    hX[i] = static_cast<float>(i);
+    hY[i] = static_cast<float>(i * 2);
+  }
+
+  hipDeviceptr_t dX, dY, dOut;
+  hipMalloc(&dX, bufferSize);
+  hipMalloc(&dY, bufferSize);
+  hipMalloc(&dOut, bufferSize);
+  hipMemcpyHtoD(dX, hX.get(), bufferSize);
+  hipMemcpyHtoD(dY, hY.get(), bufferSize);
+
+  struct {
+    float a_;
+    hipDeviceptr_t b_;
+    hipDeviceptr_t c_;
+    hipDeviceptr_t d_;
+    size_t e_;
+  } args{a, dX, dY, dOut, n};
+
+  auto size = sizeof(args);
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                    HIP_LAUNCH_PARAM_END};
+
+  hipModuleLaunchKernel(kernel, NUM_BLOCKS, 1, 1, NUM_THREADS, 1, 1,
+                        0, nullptr, nullptr, config);
+  hipMemcpyDtoH(hOut.get(), dOut, bufferSize);
+
+  for (size_t i = 0; i < n; ++i) {
+    REQUIRE(fabs(a * hX[i] + hY[i] - hOut[i]) <= fabs(hOut[i]) * 1e-6);
+  }
+
+  hipFree(dX);
+  hipFree(dY);
+  hipFree(dOut);
+
+  hipModuleUnload(module);
 }
diff --git a/tests/catch/unit/rtc/test.cc b/tests/catch/unit/rtc/test.cc
new file mode 100644
index 0000000000..3b12610458
--- /dev/null
+++ b/tests/catch/unit/rtc/test.cc
@@ -0,0 +1,6 @@
+#include <catch.hpp>
+
+TEST_CASE("cpp17 test") {
+  constexpr auto l = []() { return 2 * 10 * 30; };
+  REQUIRE(l() == 600);
+}
diff --git a/tests/performance/memory/hipPerfMemFill.cpp b/tests/performance/memory/hipPerfMemFill.cpp
deleted file mode 100644
index 1570c84301..0000000000
--- a/tests/performance/memory/hipPerfMemFill.cpp
+++ /dev/null
@@ -1,526 +0,0 @@
-/*
- Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia - * TEST: %t - * HIT_END - */ - -#include "test_common.h" -#include <iostream> -#include <iomanip> -#include <chrono> -#include <vector> - -#define SIMPLY_ASSIGN 0 -#define USE_HIPTEST_SETNUMBLOCKS 0 - -using namespace std; - -template <typename T> -__global__ void vec_fill(T *x, T coef, int N) { - const int istart = threadIdx.x + blockIdx.x * blockDim.x; - const int ishift = blockDim.x * gridDim.x; - for (int i = istart; i < N; i += ishift) { -#if SIMPLY_ASSIGN - x[i] = coef; -#else - x[i] = coef * i; -#endif - } -} - -__device__ void print_log(int i, double value, double expected) { - printf("failed at %d: val=%g, expected=%g\n", i, value, expected); -} - -__device__ void print_log(int i, int value, int expected) { - printf("failed at %d: val=%d, expected=%d\n", i, value, expected); -} - -template <typename T> -__global__ void vec_verify(T *x, T coef, int N) { - const int istart = threadIdx.x + blockIdx.x * blockDim.x; - const int ishift = blockDim.x * gridDim.x; - for (int i = istart; i < N; i += ishift) { -#if SIMPLY_ASSIGN - if(x[i] != coef) { - print_log(i, x[i], coef); - } -#else - if(x[i] != coef * i) { - print_log(i, x[i], coef * i); - } -#endif - } -} - -template <typename T> -__global__ void daxpy(T *__restrict__ x, T *__restrict__ y, - const T coef, int Niter, int N) { - const int istart = threadIdx.x + blockIdx.x * blockDim.x; - const int ishift = blockDim.x * gridDim.x; - for (int iter = 0; iter < Niter; ++iter) { - T iv = coef * iter; - for (int i = istart; i < N; i += ishift) - y[i] = iv * x[i] + y[i]; - } -} - -template <class T> -class hipPerfMemFill { - private: - static constexpr int NUM_START = 27; - static constexpr int NUM_SIZE = 5; - static constexpr int NUM_ITER = 10; - size_t totalSizes_[NUM_SIZE]; - hipDeviceProp_t props_; - const T coef_ = getCoefficient(3.14159); - const unsigned int blocksPerCU_; - const unsigned int threadsPerBlock_; - - public: - hipPerfMemFill(unsigned int blocksPerCU, unsigned int threadsPerBlock) : - blocksPerCU_(blocksPerCU), threadsPerBlock_(threadsPerBlock) { - for (int i = 0; i < NUM_SIZE; i++) { - totalSizes_[i] = 1ull << (i + NUM_START); // 128M, 256M, 512M, 1024M, 2048M - } - } - - ~hipPerfMemFill() { - } - - bool supportLargeBar() { - return props_.isLargeBar != 0; - } - - bool supportManagedMemory() { - return props_.managedMemory != 0; - } - - const T getCoefficient(double val) { - return static_cast<T>(val); - } - - void setHostBuffer(T *A, T val, size_t size) { - size_t len = size / sizeof(T); - for (int i = 0; i < len; i++) { - A[i] = val; - } - } - - void open(int deviceId) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - cout << "Info: didn't find any GPU!
skipping the test!\n"; - passed(); - } else if (deviceId >= nGpu) { - failed("Info: wrong GPU Id %d\n", deviceId); - } - - HIPCHECK(hipSetDevice(deviceId)); - HIPCHECK(hipGetDeviceProperties(&props_, deviceId)); - std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x" - << props_.pciBusID << " " << props_.name << " with " - << props_.multiProcessorCount << " CUs, large bar: " - << supportLargeBar() << ", managed memory: " << supportManagedMemory() - << ", DeviceMallocFinegrained: " << supportDeviceMallocFinegrained() - << std::endl; - } - - void log_host(const char* title, double GBytes, double sec) { - cout << title << " [" << setw(7) << GBytes << " GB]: cost " << setw(10) << sec - << " s in bandwidth " << setw(10) << GBytes / sec << " [GB/s]" << endl; - } - - void log_kernel(const char* title, double GBytes, double sec, double sec_hv, double sec_kv) { - cout << title << " [" << setw(7) << GBytes << " GB]: cost " << setw(10) << sec - << " s in bandwidth " << setw(10) << GBytes / sec << " [GB/s]" << ", hostVerify cost " - << setw(10) << sec_hv << " s in bandwidth " << setw(10) << GBytes / sec_hv << " [GB/s]" - << ", kernelVerify cost "<< setw(10) << sec_kv << " s in bandwidth " << setw(10) - << GBytes / sec_kv << " [GB/s]" << endl; - } - - void hostFill(size_t size, T *data, T coef, double &sec) { - size_t num = size / sizeof(T); // Size of elements - auto start = chrono::steady_clock::now(); - for (int i = 0; i < num; ++i) { -#if SIMPLY_ASSIGN - data[i] = coef; -#else - data[i] = coef * i; -#endif - } - auto end = chrono::steady_clock::now(); - chrono::duration<double> diff = end - start; // in second - sec = diff.count(); - } - - void kernelFill(size_t size, T *data, T coef, double &sec) { - size_t num = size / sizeof(T); // Size of elements - unsigned blocks = setNumBlocks(num); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill<T>), dim3(blocks), - dim3(threadsPerBlock), 0, 0, data, 0, num); // kernel will be loaded first time - HIPCHECK(hipDeviceSynchronize()); - - auto start = chrono::steady_clock::now(); - - for (int iter = 0; iter < NUM_ITER; ++iter) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill<T>), dim3(blocks), - dim3(threadsPerBlock), 0, 0, data, coef, num); - } - HIPCHECK(hipDeviceSynchronize()); - - auto end = chrono::steady_clock::now(); - chrono::duration<double> diff = end - start; // in second - sec = diff.count() / NUM_ITER; // in second - } - - void hostVerify(size_t size, T *data, T coef, double &sec) { - size_t num = size / sizeof(T); // Size of elements - auto start = chrono::steady_clock::now(); - for (int i = 0; i < num; ++i) { -#if SIMPLY_ASSIGN - if(data[i] != coef) { - cout << "hostVerify failed: i=" << i << ", data[i]=" << data[i] << ", expected=" << coef << endl; - failed("failed\n"); - } -#else - if(data[i] != coef * i) { - cout << "hostVerify failed: i=" << i << ", data[i]=" << data[i] << ", expected=" << coef * i << endl; - failed("failed\n"); - } -#endif - } - auto end = chrono::steady_clock::now(); - chrono::duration<double> diff = end - start; // in second - sec = diff.count(); - } - - void kernelVerify(size_t size, T *data, T coef, double &sec) { - size_t num = size / sizeof(T); // Size of elements - unsigned blocks = setNumBlocks(num); - - CaptureStream *capture = new CaptureStream(stdout); - capture->Begin(); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify<T>), dim3(blocks), - dim3(threadsPerBlock), 0, 0, data, coef, num); // kernel will be loaded first time - HIPCHECK(hipDeviceSynchronize()); - - capture->End(); - capture->Truncate(1000); // Don't want too long log
if existing - std::string device_output = capture->getData(); - delete capture; - if (device_output.length() > 0) { - failed("kernelVerify failed:\n%s\n", device_output.c_str()); - } - - // Now all data verified. The following is to test bandwidth. - auto start = chrono::steady_clock::now(); - - for (int iter = 0; iter < NUM_ITER; ++iter) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify<T>), dim3(blocks), - dim3(threadsPerBlock), 0, 0, data, coef, num); - } - HIPCHECK(hipDeviceSynchronize()); - - auto end = chrono::steady_clock::now(); - chrono::duration<double> diff = end - start; // in second - sec = diff.count() / NUM_ITER; // in second - } - - bool testLargeBarDeviceMemoryHostFill(size_t size) { - if (!supportLargeBar()) { - return false; - } - - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A; - HIPCHECK(hipMalloc(&A, size)); - double sec = 0; - hostFill(size, A, coef_, sec); // Cpu can access device mem in LB - HIPCHECK(hipFree(A)); - - log_host("Largebar: host fill", GBytes, sec); - return true; - } - - bool testLargeBar() { - if (!supportLargeBar()) { - return false; - } - - cout << "Test large bar device memory host filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testLargeBarDeviceMemoryHostFill(totalSizes_[i])) { - return false; - } - } - - return true; - } - - bool testManagedMemoryHostFill(size_t size) { - if (!supportManagedMemory()) { - return false; - } - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A; - HIPCHECK(hipMallocManaged(&A, size)); - double sec = 0; - hostFill(size, A, coef_, sec); // Cpu can access HMM mem - HIPCHECK(hipFree(A)); - - log_host("Managed: host fill", GBytes, sec); - return true; - } - - bool testManagedMemoryKernelFill(size_t size) { - if (!supportManagedMemory()) { - return false; - } - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A; - HIPCHECK(hipMallocManaged(&A, size)); - - double sec = 0, sec_hv = 0, sec_kv = 0; - kernelFill(size, A, coef_, sec); - hostVerify(size, A, coef_, sec_hv); // Managed memory can be verified by host - kernelVerify(size, A, coef_, sec_kv); - HIPCHECK(hipFree(A)); - - log_kernel("Managed: kernel fill", GBytes, sec, sec_hv, sec_kv); - - return true; - } - - bool testManagedMemory() { - if (!supportManagedMemory()) { - return false; - } - - cout << "Test managed memory host filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testManagedMemoryHostFill(totalSizes_[i])) { - return false; - } - } - - cout << "Test managed memory kernel filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testManagedMemoryKernelFill(totalSizes_[i])) { - return false; - } - } - - return true; - } - - bool testHostMemoryHostFill(size_t size, unsigned int flags) { - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - T *A; - HIPCHECK(hipHostMalloc(&A, size, flags)); - double sec = 0; - hostFill(size, A, coef_, sec); - HIPCHECK(hipHostFree(A)); - - log_host("Host: host fill", GBytes, sec); - return true; - } - - bool testHostMemoryKernelFill(size_t size, unsigned int flags) { - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A; - HIPCHECK(hipHostMalloc((void** ) &A, size, flags)); - double sec = 0, sec_hv = 0, sec_kv = 0; - kernelFill(size, A, coef_, sec); - hostVerify(size, A, coef_, sec_hv); - kernelVerify(size, A, coef_, sec_kv); - HIPCHECK(hipHostFree(A)); - - log_kernel("Host: kernel fill", GBytes, sec, sec_hv, sec_kv); - return true; - } - - bool testHostMemory() { - cout << "Test coherent host memory host filling" <<
endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testHostMemoryHostFill(totalSizes_[i], hipHostMallocCoherent)) { - return false; - } - } - - cout << "Test non-coherent host memory host filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testHostMemoryHostFill(totalSizes_[i], hipHostMallocNonCoherent)) { - return false; - } - } - - cout << "Test coherent host memory kernel filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testHostMemoryKernelFill(totalSizes_[i], hipHostMallocCoherent)) { - return false; - } - } - - cout << "Test non-coherent host memory kernel filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testHostMemoryKernelFill(totalSizes_[i], hipHostMallocNonCoherent)) { - return false; - } - } - - return true; - } - - /* This function should be via device attribute query*/ - bool supportDeviceMallocFinegrained() { - T *A = nullptr; - hipExtMallocWithFlags((void **)&A, sizeof(T), hipDeviceMallocFinegrained); - if (!A) { - return false; - } - HIPCHECK(hipFree(A)); - return true; - } - - unsigned int setNumBlocks(size_t size) { - size_t num = size/sizeof(T); - -#if USE_HIPTEST_SETNUMBLOCKS - return HipTest::setNumBlocks(blocksPerCU_, threadsPerBlock_, - num); -#else - return (num + threadsPerBlock_ - 1) / threadsPerBlock_; -#endif - } - - bool testExtDeviceMemoryHostFill(size_t size, unsigned int flags) { - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A = nullptr; - HIPCHECK(hipExtMallocWithFlags((void **)&A, size, flags)); - if (!A) { - cout << "failed hipExtMallocWithFlags() with size =" << size << " flags=" - << std::hex << flags << endl; - return false; - } - - double sec = 0; - hostFill(size, A, coef_, sec); // Cpu can access this mem - HIPCHECK(hipFree(A)); - - log_host("ExtDevice: host fill", GBytes, sec); - return true; - } - - bool testExtDeviceMemoryKernelFill(size_t size, unsigned int flags) { - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A = nullptr; - HIPCHECK(hipExtMallocWithFlags((void **)&A, size, flags)); - if (!A) { - cout << "failed hipExtMallocWithFlags() with size =" << size << " flags=" - << std::hex << flags << endl; - return false; - } - - double sec = 0, sec_hv = 0, sec_kv = 0; - kernelFill(size, A, coef_, sec); - hostVerify(size, A, coef_, sec_hv); // Fine grained device memory can be verified by host - kernelVerify(size, A, coef_, sec_kv); - HIPCHECK(hipFree(A)); - - log_kernel("ExtDevice: kernel fill", GBytes, sec, sec_hv, sec_kv); - - return true; - } - - bool testExtDeviceMemory() { - cout << "Test fine grained device memory host filling" - << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testExtDeviceMemoryHostFill(totalSizes_[i], - hipDeviceMallocFinegrained)) { - return false; - } - } - - cout << "Test fine grained device memory kernel filling" - << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testExtDeviceMemoryKernelFill(totalSizes_[i], - hipDeviceMallocFinegrained)) { - return false; - } - } - - return true; - } - - bool run() { - if (supportLargeBar()) { - if (!testLargeBar()) { - return false; - } - } - - if (supportManagedMemory()) { - if (!testManagedMemory()) { - return false; - } - } - - if (!testHostMemory()) { - return false; - } - - if (supportDeviceMallocFinegrained()) { - if (!testExtDeviceMemory()) { - return false; - } - } - return true; - } - -}; - -int main(int argc, char *argv[]) { - HipTest::parseStandardArguments(argc, argv, true); // For ::p_gpuDevice, ::blocksPerCU, ::threadsPerBlock - cout << "Test int" << endl; - hipPerfMemFill<int>
hipPerfMemFillInt(::blocksPerCU, ::threadsPerBlock); - hipPerfMemFillInt.open(::p_gpuDevice); - HIPASSERT(hipPerfMemFillInt.run()); - - cout << "Test double" << endl; - hipPerfMemFill<double> hipPerfMemFillDouble(::blocksPerCU, ::threadsPerBlock); - hipPerfMemFillDouble.open(::p_gpuDevice); - HIPASSERT(hipPerfMemFillDouble.run()); - - passed(); -} diff --git a/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp b/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp index 857b165577..43691882a9 100644 --- a/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp +++ b/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp @@ -114,10 +114,7 @@ __device__ void double_precision_math_functions() { scalbn(0.0, 1); signbit(1.0); sin(0.0); -#if not(defined(__HIP_PLATFORM_NVIDIA__) && (CUDA_VERSION == 11030 || CUDA_VERSION == 11020)) - //NV A100 has a bug in sincos(), so temporarily disable it sincos(0.0, &fX, &fY); -#endif sincospi(0.0, &fX, &fY); sinh(0.0); sinpi(0.0); diff --git a/tests/src/deviceLib/hipIntegerIntrinsics.cpp b/tests/src/deviceLib/hipIntegerIntrinsics.cpp index 2074e27192..42d218acc5 100644 --- a/tests/src/deviceLib/hipIntegerIntrinsics.cpp +++ b/tests/src/deviceLib/hipIntegerIntrinsics.cpp @@ -46,10 +46,6 @@ __device__ void integer_intrinsics() { __clzll((long long)10); __ffs((int)10); __ffsll((long long)10); - __funnelshift_l((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0); - __funnelshift_lc((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0); - __funnelshift_r((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0); - __funnelshift_rc((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0); __hadd((int)1, (int)3); __mul24((int)1, (int)2); __mul64hi((long long)1, (long long)2); diff --git a/tests/src/deviceLib/hipTestClock.cpp b/tests/src/deviceLib/hipTestClock.cpp index d3661b67ea..b5bb5d9a05 100644 --- a/tests/src/deviceLib/hipTestClock.cpp +++ b/tests/src/deviceLib/hipTestClock.cpp @@ -31,21 +31,21 @@ THE SOFTWARE. #define HIP_ASSERT(status) assert(status == hipSuccess) #define LEN 512 -#define SIZE (LEN * sizeof(long long)) +#define SIZE 2048 - static __global__ void kernel1(long long* Ad) { + static __global__ void kernel1(int* Ad) { int tid = threadIdx.x + blockIdx.x * blockDim.x; Ad[tid] = clock() + clock64() + __clock() + __clock64(); } - static __global__ void kernel2(long long* Ad) { + static __global__ void kernel2(int* Ad) { int tid = threadIdx.x + blockIdx.x * blockDim.x; Ad[tid] = clock() + clock64() + __clock() + __clock64() - Ad[tid]; } void run() { - long long *A, *Ad; - A = new long long[LEN]; + int *A, *Ad; + A = new int[LEN]; for (unsigned i = 0; i < LEN; i++) { A[i] = 0; } diff --git a/tests/src/deviceLib/hip_funnelshift.cpp b/tests/src/deviceLib/hip_funnelshift.cpp deleted file mode 100644 index 7c77548f7a..0000000000 --- a/tests/src/deviceLib/hip_funnelshift.cpp +++ /dev/null @@ -1,252 +0,0 @@ -/* -Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software.
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* HIT_START - * BUILD: %t %s ../test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include -#include -#include -#include -#include - -#define HIP_ASSERT(x) (assert((x) == hipSuccess)) - -#define NUM_TESTS 65 - -#define HI_INT 0xfacefeed -#define LO_INT 0xdeadbeef - -__global__ void funnelshift_kernel(unsigned int* l_out, unsigned int* lc_out, - unsigned int* r_out, unsigned int* rc_out) { - - for (int i = 0; i < NUM_TESTS; i++) { - l_out[i] = __funnelshift_l(LO_INT, HI_INT, i); - lc_out[i] = __funnelshift_lc(LO_INT, HI_INT, i); - r_out[i] = __funnelshift_r(LO_INT, HI_INT, i); - rc_out[i] = __funnelshift_rc(LO_INT, HI_INT, i); - } -} - -static unsigned int cpu_funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) -{ - // Concatenate hi:lo - uint64_t val = hi; - val <<= 32; - val |= lo; - // left shift by intput & 31 - val <<= (shift & 31); - // pull out upper 32 bits and return them - val >>= 32; - return val & 0xffffffff; -} - -static unsigned int cpu_funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) -{ - // Concatenate hi:lo - uint64_t val = hi; - val <<= 32; - val |= lo; - // left shift by min(input,32) - if (shift > 32) - shift = 32; - val <<= shift; - // pull out upper 32 bits and return them - val >>= 32; - return val & 0xffffffff; -} - -static unsigned int cpu_funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift) -{ - // Concatenate hi:lo - uint64_t val = hi; - val <<= 32; - val |= lo; - // right shift by intput & 31 - val >>= (shift & 31); - // return lower 32 bits - return val & 0xffffffff; -} - -static unsigned int cpu_funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) -{ - // Concatenate hi:lo - uint64_t val = hi; - val <<= 32; - val |= lo; - // left shift by min(input, 32) - if (shift > 32) - shift = 32; - val >>= shift; - // return lower 32 bits - return val & 0xffffffff; -} - -using namespace std; - -int main() { - unsigned int *host_l_output; - unsigned int *host_lc_output; - unsigned int *host_r_output; - unsigned int *host_rc_output; - - unsigned int *device_l_output; - unsigned int *device_lc_output; - unsigned int *device_r_output; - unsigned int *device_rc_output; - - unsigned int *golden_l; - unsigned int *golden_lc; - unsigned int *golden_r; - unsigned int *golden_rc; - - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - cout << " System minor " << devProp.minor << endl; - cout << " System major " << devProp.major << endl; - cout << " agent prop name " << devProp.name << endl; - - cout << "hip Device prop succeeded " << endl; - - - int i; - int errors; - - host_l_output = (unsigned int*)calloc(NUM_TESTS, sizeof(unsigned int)); - host_lc_output = (unsigned int*)calloc(NUM_TESTS, sizeof(unsigned int)); - host_r_output = (unsigned int*)calloc(NUM_TESTS, sizeof(unsigned int)); - host_rc_output = (unsigned int*)calloc(NUM_TESTS, sizeof(unsigned int)); - - golden_l = (unsigned int*)calloc(NUM_TESTS, sizeof(unsigned int)); - golden_lc = (unsigned int*)calloc(NUM_TESTS, sizeof(unsigned int)); - golden_r = 
(unsigned int*)calloc(NUM_TESTS, sizeof(unsigned int)); - golden_rc = (unsigned int*)calloc(NUM_TESTS, sizeof(unsigned int)); - - for (int i = 0; i < NUM_TESTS; i++) { - golden_l[i] = cpu_funnelshift_l(LO_INT, HI_INT, i); - golden_lc[i] = cpu_funnelshift_lc(LO_INT, HI_INT, i); - golden_r[i] = cpu_funnelshift_r(LO_INT, HI_INT, i); - golden_rc[i] = cpu_funnelshift_rc(LO_INT, HI_INT, i); - } - - HIP_ASSERT(hipMalloc((void**)&device_l_output, NUM_TESTS * sizeof(unsigned int))); - HIP_ASSERT(hipMalloc((void**)&device_lc_output, NUM_TESTS * sizeof(unsigned int))); - HIP_ASSERT(hipMalloc((void**)&device_r_output, NUM_TESTS * sizeof(unsigned int))); - HIP_ASSERT(hipMalloc((void**)&device_rc_output, NUM_TESTS * sizeof(unsigned int))); - - hipLaunchKernelGGL(funnelshift_kernel, dim3(1), dim3(1), 0, 0, - device_l_output, device_lc_output, device_r_output, - device_rc_output); - - HIP_ASSERT(hipMemcpy(host_l_output, device_l_output, NUM_TESTS * sizeof(unsigned int), hipMemcpyDeviceToHost)); - HIP_ASSERT(hipMemcpy(host_lc_output, device_lc_output, NUM_TESTS * sizeof(unsigned int), hipMemcpyDeviceToHost)); - HIP_ASSERT(hipMemcpy(host_r_output, device_r_output, NUM_TESTS * sizeof(unsigned int), hipMemcpyDeviceToHost)); - HIP_ASSERT(hipMemcpy(host_rc_output, device_rc_output, NUM_TESTS * sizeof(unsigned int), hipMemcpyDeviceToHost)); - - // verify the results - errors = 0; - printf("HI val: 0x%x\n", HI_INT); - printf("LO val: 0x%x\n", LO_INT); - - for (i = 0; i < NUM_TESTS; i++) { - printf("gpu_funnelshift_l(%d) = 0x%x, cpu_funnelshift_l(%d) = 0x%x\n", - i, host_l_output[i], i, golden_l[i]); - if (host_l_output[i] != golden_l[i]) { - errors++; - printf("\tERROR!\n"); - } - } - if (errors != 0) { - cout << "FAILED: funnelshift_l" << endl; - return -1; - } else { - cout << "funnelshift_l checked!" << endl; - } - - errors = 0; - for (i = 0; i < NUM_TESTS; i++) { - printf("gpu_funnelshift_lc(%d) = 0x%x, cpu_funnelshift_lc(%d) = 0x%x\n", - i, host_lc_output[i], i, golden_lc[i]); - if (host_lc_output[i] != golden_lc[i]) { - errors++; - printf("\tERROR!\n"); - } - } - if (errors != 0) { - cout << "FAILED: funnelshift_lc" << endl; - return -1; - } else { - cout << "funnelshift_lc checked!" << endl; - } - - errors = 0; - for (i = 0; i < NUM_TESTS; i++) { - printf("gpu_funnelshift_r(%d) = 0x%x, cpu_funnelshift_r(%d) = 0x%x\n", - i, host_r_output[i], i, golden_r[i]); - if (host_r_output[i] != golden_r[i]) { - errors++; - printf("\tERROR!\n"); - } - } - if (errors != 0) { - cout << "FAILED: funnelshift_r" << endl; - return -1; - } else { - cout << "funnelshift_r checked!" << endl; - } - - errors = 0; - for (i = 0; i < NUM_TESTS; i++) { - printf("gpu_funnelshift_rc(%d) = 0x%x, cpu_funnelshift_rc(%d) = 0x%x\n", - i, host_rc_output[i], i, golden_rc[i]); - if (host_rc_output[i] != golden_rc[i]) { - errors++; - printf("\tERROR!\n"); - } - } - if (errors != 0) { - cout << "FAILED: funnelshift_rc" << endl; - return -1; - } else { - cout << "funnelshift_rc checked!" << endl; - } - errors = 0; - - cout << "funnelshift tests PASSED!" 
<< endl; - - HIP_ASSERT(hipFree(device_l_output)); - HIP_ASSERT(hipFree(device_lc_output)); - HIP_ASSERT(hipFree(device_r_output)); - HIP_ASSERT(hipFree(device_rc_output)); - - free(host_l_output); - free(host_lc_output); - free(host_r_output); - free(host_rc_output); - - return errors; -} diff --git a/tests/src/g++/hipMalloc.cpp b/tests/src/g++/hipMalloc.cpp index 1ab421a34c..da1fcb24fa 100644 --- a/tests/src/g++/hipMalloc.cpp +++ b/tests/src/g++/hipMalloc.cpp @@ -18,7 +18,7 @@ * */ /* HIT_START - * BUILD_CMD: hipMalloc %cxx -D__HIP_PLATFORM_AMD__ -I%hip-path/include -I%rocm-path/include %S/%s -Wl,--rpath=%rocm-path/lib %hip-path/lib/libamdhip64.so -o %T/%t -std=c++11 EXCLUDE_HIP_PLATFORM nvidia EXCLUDE_HIP_LIB_TYPE static + * BUILD_CMD: hipMalloc %cxx -D__HIP_PLATFORM_AMD__ -I%hip-path/include -I%rocm-path/include %S/%s -Wl,--rpath=%roc-path/lib %hip-path/lib/libamdhip64.so -o %T/%t -std=c++11 EXCLUDE_HIP_PLATFORM nvidia EXCLUDE_HIP_LIB_TYPE static * TEST: %t EXCLUDE_HIP_PLATFORM nvidia EXCLUDE_HIP_LIB_TYPE static * HIT_END */ diff --git a/tests/src/printf/printf_common.h b/tests/src/printf/printf_common.h index d8cce83f0d..4f75b13d62 100644 --- a/tests/src/printf/printf_common.h +++ b/tests/src/printf/printf_common.h @@ -134,21 +134,6 @@ struct CaptureStream { assert(false); } } - - // Truncate the file up to size if we don't want too long log - void Truncate(size_t size) { - struct stat sb = { 0 }; - if (::stat(tempname, &sb) == -1) { - failed("failed lstat(%s) with error: %s \n", tempname, ::strerror(errno)); - return; - } - if (sb.st_size > size) { - if (::truncate(tempname, static_cast(size)) == -1) { - failed("failed truncate(%s) with error: %s \n", tempname, ::strerror(errno)); - return; - } - } - } }; #endif diff --git a/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp b/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp index 239d552708..171838eb11 100644 --- a/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp +++ b/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp @@ -20,7 +20,7 @@ THE SOFTWARE. // Simple test for hipLaunchCooperativeKernelMultiDevice API. 
/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_70,code=sm_70 * TEST: %t * HIT_END */ @@ -123,7 +123,7 @@ int main() { HIPCHECK(hipMalloc((void**)&dA[i], SIZE)); HIPCHECK(hipMalloc((void**)&dB[i], 64 * deviceProp[i].multiProcessorCount * sizeof(long))); if (i == 0) { - HIPCHECK(hipHostMalloc((void**)&dC, (nGpu + 1) * sizeof(long))); + HIPCHECK(hipHostMalloc((void**)&dC, (nGpu + 1) * sizeof(long), hipHostMallocCoherent)); } HIPCHECK(hipMemcpy(dA[i], &init[i * copySizeInDwords] , SIZE, hipMemcpyHostToDevice)); HIPCHECK(hipStreamCreate(&stream[i])); @@ -175,9 +175,6 @@ int main() { system_clock::time_point start = system_clock::now(); hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0); - for (int i = 0; i < nGpu; i++) { - hipStreamSynchronize(stream[i]); - } system_clock::time_point end = system_clock::now(); std::chrono::duration elapsed_seconds = end - start; end_time = std::chrono::system_clock::to_time_t(end); diff --git a/tests/src/runtimeApi/graph/hipGraph.cpp b/tests/src/runtimeApi/graph/hipGraph.cpp index 6fa38add56..a964ec07ff 100644 --- a/tests/src/runtimeApi/graph/hipGraph.cpp +++ b/tests/src/runtimeApi/graph/hipGraph.cpp @@ -24,8 +24,9 @@ #include /* HIT_START * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvidia - * TEST: %t EXCLUDE_HIP_PLATFORM nvidia + * TEST: %t EXCLUDE_HIP_PLATFORM all * HIT_END + */ #define THREADS_PER_BLOCK 512 #define GRAPH_LAUNCH_ITERATIONS 3 @@ -244,6 +245,9 @@ bool hipGraphsManual(float* inputVec_h, float* inputVec_d, double* outputVec_d, int main(int argc, char** argv) { size_t size = 1 << 12; // number of elements to reduce size_t maxBlocks = 512; + // This will pick the best possible CUDA capable device + int devID = 1; // TODO: implement: findCudaDevice(argc, (const char**)argv); based of max GFLOPS + // incase of multiple devic hipSetDevice(0); // printf("%zu elements\n", size); printf("threads per block = %d\n", THREADS_PER_BLOCK); diff --git a/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp b/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp index 7f491958c5..8676706c53 100644 --- a/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp +++ b/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp @@ -197,7 +197,7 @@ bool NegativeTestsMallocManaged(int NumDevices) { // success and contradicts with api doc. // With size(0), api expected to return error code (or) - // reset ptr while returning success (to accommodate cuda 11.2 api behavior). + // reset ptr while returning success (to accomadate cuda 11.2 api behavior). err = hipMallocManaged(&A, 0, hipMemAttachGlobal); if ((hipErrorInvalidValue == err) || ((hipSuccess == err) && (nullptr == A))) { @@ -213,33 +213,16 @@ bool NegativeTestsMallocManaged(int NumDevices) { IfTestPassed = false; } - err = hipMallocManaged(NULL, 1024, hipMemAttachHost); +#ifdef __HIP_PLATFORM_AMD__ + // The flag hipMemAttachHost is currently not supported therefore + // api should return "hipErrorInvalidValue" for now + err = hipMallocManaged(&A, 1024, hipMemAttachHost); if (hipErrorInvalidValue != err) { printf("hipMallocManaged: Returned %s for 'hipMemAttachHost' flag\n", hipGetErrorString(err)); IfTestPassed = false; } - - // cuda api doc says : If size is 0, cudaMallocManaged returns - // cudaErrorInvalidValue. 
However, it is observed that cuda 11.2 api returns - // success and contradicts with api doc. - - // With size(0), api expected to return error code (or) - // reset ptr while returning success (to accommodate cuda 11.2 api behavior). - err = hipMallocManaged(&A, 0, hipMemAttachHost); - if ((hipErrorInvalidValue == err) || - ((hipSuccess == err) && (nullptr == A))) { - IfTestPassed &= true; - } else { - IfTestPassed = false; - } - - err = hipMallocManaged(NULL, 0, hipMemAttachHost); - if (hipErrorInvalidValue != err) { - printf("hipMallocManaged: Returned %s when devPtr & size is null & 0\n", - hipGetErrorString(err)); - IfTestPassed = false; - } +#endif // __HIP_PLATFORM_AMD__ err = hipMallocManaged(NULL, 0, 0); if (hipErrorInvalidValue != err) { diff --git a/tests/src/runtimeApi/module/hipManagedKeyword.cpp b/tests/src/runtimeApi/module/hipManagedKeyword.cpp deleted file mode 100644 index c2b6b81691..0000000000 --- a/tests/src/runtimeApi/module/hipManagedKeyword.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* -Copyright (c) 2021-present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -/* HIT_START - * BUILD_CMD: managed_kernel.code %hc --genco %S/managed_kernel.cpp -o managed_kernel.code EXCLUDE_HIP_PLATFORM amd - * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvidia EXCLUDE_HIP_PLATFORM amd - * TEST: %t - * HIT_END - */ - -#include "hip/hip_runtime.h" -#include -#include "test_common.h" - -#define MANAGED_VAR_INIT_VALUE 10 -#define fileName "managed_kernel.code" - -bool managedMultiGPUTest() { - int numDevices = 0; - hipDeviceptr_t x; - size_t xSize; - int data; - hipGetDeviceCount(&numDevices); - for (int i = 0; i < numDevices; i++) { - hipSetDevice(i); - hipModule_t Module; - HIPCHECK(hipModuleLoad(&Module, fileName)); - hipFunction_t Function; - HIPCHECK(hipModuleGetFunction(&Function, Module, "GPU_func")); - HIPCHECK(hipModuleLaunchKernel(Function, 1, 1, 1, 1, 1, 1, 0, 0, NULL, NULL)); - hipDeviceSynchronize(); - HIPCHECK(hipModuleGetGlobal((hipDeviceptr_t*)&x, &xSize, Module, "x")); - HIPCHECK(hipMemcpyDtoH(&data, hipDeviceptr_t(x), xSize)); - if (data != (1 + MANAGED_VAR_INIT_VALUE)) { - HIPCHECK(hipModuleUnload(Module)); - return false; - } - HIPCHECK(hipModuleUnload(Module)); - } - return true; -} - -int main(int argc, char** argv) { - hipInit(0); - bool testStatus = managedMultiGPUTest(); - if (!testStatus) { - failed("Managed keyword module test failed!"); - } - passed(); -} diff --git a/tests/src/runtimeApi/module/hipModuleLaunchKernel.cpp b/tests/src/runtimeApi/module/hipModuleLaunchKernel.cpp index 550a52e41c..40cbccd2d0 100644 --- a/tests/src/runtimeApi/module/hipModuleLaunchKernel.cpp +++ b/tests/src/runtimeApi/module/hipModuleLaunchKernel.cpp @@ -227,7 +227,11 @@ bool Module_GridBlock_Corner_Tests() { bool testStatus = true; HIPCHECK(hipSetDevice(0)); hipError_t err; + struct { + } args1; hipFunction_t DummyKernel; + size_t size1; + size1 = sizeof(args1); hipModule_t Module; hipStream_t stream1; hipDeviceptr_t *Ad; @@ -239,6 +243,9 @@ bool Module_GridBlock_Corner_Tests() { #endif HIPCHECK(hipModuleLoad(&Module, fileName)); HIPCHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel)); + void *config1[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args1, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size1, + HIP_LAUNCH_PARAM_END}; HIPCHECK(hipStreamCreate(&stream1)); // Passing Max int value to block dimensions hipDeviceProp_t deviceProp; @@ -270,7 +277,8 @@ bool Module_GridBlock_Corner_Tests() { test[i].blockY, test[i].blockZ, 0, - stream1, NULL, NULL); + stream1, NULL, + reinterpret_cast(&config1)); if (err != hipSuccess) { printf("hipModuleLaunchKernel failed (%u, %u, %u) and (%u, %u, %u)", test[i].gridX, test[i].gridY, test[i].gridZ, @@ -290,7 +298,11 @@ bool Module_WorkGroup_Test() { bool testStatus = true; HIPCHECK(hipSetDevice(0)); hipError_t err; + struct { + } args1; hipFunction_t DummyKernel; + size_t size1; + size1 = sizeof(args1); hipModule_t Module; hipStream_t stream1; hipDeviceptr_t *Ad; @@ -302,6 +314,9 @@ bool Module_WorkGroup_Test() { #endif HIPCHECK(hipModuleLoad(&Module, fileName)); HIPCHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel)); + void *config1[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args1, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size1, + HIP_LAUNCH_PARAM_END}; HIPCHECK(hipStreamCreate(&stream1)); // Passing Max int value to block dimensions hipDeviceProp_t deviceProp; @@ -315,7 +330,8 @@ bool Module_WorkGroup_Test() { err = hipModuleLaunchKernel(DummyKernel, 1, 1, 1, cuberoot_floor, cuberoot_floor, cuberoot_floor, - 0, stream1, NULL, NULL); + 0, stream1, NULL, + reinterpret_cast(&config1)); if (err != hipSuccess) { 
printf("hipModuleLaunchKernel failed block dimensions (%u, %u, %u)", cuberoot_floor, cuberoot_floor, cuberoot_floor); @@ -326,7 +342,8 @@ bool Module_WorkGroup_Test() { err = hipModuleLaunchKernel(DummyKernel, 1, 1, 1, cuberoot_ceil, cuberoot_ceil, cuberoot_ceil + 1, - 0, stream1, NULL, NULL); + 0, stream1, NULL, + reinterpret_cast(&config1)); if (err == hipSuccess) { printf("hipModuleLaunchKernel failed block dimensions (%u, %u, %u)", cuberoot_ceil, cuberoot_ceil, cuberoot_ceil); diff --git a/tests/src/texture/hipTextureMipmapObj2D.cpp b/tests/src/texture/hipTextureMipmapObj2D.cpp index b6ac46d91e..a4980806f1 100644 --- a/tests/src/texture/hipTextureMipmapObj2D.cpp +++ b/tests/src/texture/hipTextureMipmapObj2D.cpp @@ -1,27 +1,5 @@ -/* -Copyright (c) 2019-present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - /* HIT_START - * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../test_common.cpp * TEST: %t * HIT_END */ From f64659ae48dab39e574d34b80282b3f446a45df3 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 27 Jan 2022 05:17:05 +0000 Subject: [PATCH 02/38] SWDEV-1 - Bump patch version to 20270 Change-Id: I820cb2048b4964858bfe9e9fc4db21ace7de31f9 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 961f45bb68..e59039868c 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "0"; -$HIP_BASE_VERSION_PATCH = "0"; +$HIP_BASE_VERSION_PATCH = "20270"; #--- # Function to parse config file From b5229a0b932ac7fdeafa0a6d9809410f96b50fa9 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 11 Feb 2022 08:08:37 +0000 Subject: [PATCH 03/38] SWDEV-1 - Bump patch version to 20420 Change-Id: Id6a5a7918915619b6865b103760b08ec721d82af --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index e59039868c..861a8b4e39 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "0"; -$HIP_BASE_VERSION_PATCH = "20270"; +$HIP_BASE_VERSION_PATCH = "20420"; #--- # Function to parse config file From 773ec02bb0bf76e9bafe4944879d45f3ef7b6785 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 22 Feb 2022 10:50:55 +0000 Subject: [PATCH 04/38] SWDEV-1 - Bump patch version to 20530 Change-Id: I05308b6dca72f34da59f3512d14d984fd6b723f3 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index c2925dfb87..7507bf0b2e 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "1"; -$HIP_BASE_VERSION_PATCH = "20420"; +$HIP_BASE_VERSION_PATCH = "20530"; #--- # Function to parse config file From c7d94539d3d83331f2dd0c62f6a6956d6db22db5 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 25 Feb 2022 14:27:24 +0000 Subject: [PATCH 05/38] SWDEV-1 - Bump patch version to 20560 Change-Id: I5419061723073fdf39fc7fcbbbc402e131bb94e1 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 7507bf0b2e..7d0b04b22c 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "1"; -$HIP_BASE_VERSION_PATCH = "20530"; +$HIP_BASE_VERSION_PATCH = "20560"; #--- # Function to parse config file From 0a9f2c863b245cf75cc443245a849de5d6411c85 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 14 Mar 2022 06:33:28 +0000 Subject: [PATCH 06/38] SWDEV-1 - Bump patch version to 20590 Change-Id: I52d3968c906f6386bb445b48d0330093321e0ce1 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 7d0b04b22c..3b12ae7f98 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "1"; -$HIP_BASE_VERSION_PATCH = "20560"; +$HIP_BASE_VERSION_PATCH = "20590"; #--- # Function to parse config file From bda991b9081ac381e2c93be79479455c71ee08ca Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 25 Mar 2022 05:56:28 +0000 Subject: [PATCH 07/38] SWDEV-1 - Bump patch version 
to 20730 Change-Id: I075df408387b3829a49ea730c85c36ac9a2eddd9 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index bb795d2707..85a384a959 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "2"; -$HIP_BASE_VERSION_PATCH = "20590"; +$HIP_BASE_VERSION_PATCH = "20730"; #--- # Function to parse config file From 33df4f49a31f9564e0a1550e9a7366bdb28c9e8c Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 19 May 2022 09:11:25 +0000 Subject: [PATCH 08/38] SWDEV-1 - Bump patch version to 21310 Change-Id: Id4c34e6bba5eca4d550c9d755835e2d23f0244bc --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index a4c41ba360..151c42db45 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "2"; -$HIP_BASE_VERSION_PATCH = "20730"; +$HIP_BASE_VERSION_PATCH = "21310"; #--- # Function to parse config file From 7a160e323e943799ddb1f855d1195ff7803c9f0c Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 26 May 2022 09:53:41 +0000 Subject: [PATCH 09/38] SWDEV-1 - Bump patch version to 21400 Change-Id: I33bc16fb77e8bdf919de032b221a7d46831287ac --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 151c42db45..5ce5c6c99a 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "2"; -$HIP_BASE_VERSION_PATCH = "21310"; +$HIP_BASE_VERSION_PATCH = "21400"; #--- # Function to parse config file From df0d560e442b7e94caa4de7e0418c6b45d9a10d0 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 21 Jun 2022 08:49:24 +0000 Subject: [PATCH 10/38] SWDEV-1 - Bump patch version to 21530 Change-Id: I7f606f340ec903ba30ce2b7b119b91cfcbae387d --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index dc075211b6..a98f6739f0 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "3"; -$HIP_BASE_VERSION_PATCH = "21400"; +$HIP_BASE_VERSION_PATCH = "21530"; #--- # Function to parse config file From 81311e36b03e507b4778e51b6904ed11331b2e9f Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 5 Jul 2022 06:06:11 +0000 Subject: [PATCH 11/38] SWDEV-1 - Bump patch version to 21780 Change-Id: I6be92cb41392d5d0d790ad2f959d643368f723bb --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index a98f6739f0..bf42c29294 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "3"; -$HIP_BASE_VERSION_PATCH = "21530"; +$HIP_BASE_VERSION_PATCH = "21780"; #--- # Function to parse config file From 8a680f4cb84a9a93d0a57f3096d501ca4ac881d0 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 15 Jul 2022 08:39:42 +0000 Subject: [PATCH 12/38] SWDEV-1 - Bump patch version to 21850 Change-Id: Idb79c40f3edbd60f0a03f8ab00ccdb62f2231334 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index bf42c29294..15650882d9 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = 
"5"; $HIP_BASE_VERSION_MINOR = "3"; -$HIP_BASE_VERSION_PATCH = "21780"; +$HIP_BASE_VERSION_PATCH = "21850"; #--- # Function to parse config file From fdc46ed1a38ebfbe1494d12569b03d67e5825c4d Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 22 Jul 2022 09:28:17 +0000 Subject: [PATCH 13/38] SWDEV-1 - Bump patch version to 22000 Change-Id: I0612a7af01a096e4349c6f72636acc202ae5650e --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 15650882d9..104648a66a 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "3"; -$HIP_BASE_VERSION_PATCH = "21850"; +$HIP_BASE_VERSION_PATCH = "22000"; #--- # Function to parse config file From 59bcca20972d5e3ebcd0aac1de85926ac50338d2 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 29 Jul 2022 08:51:17 +0000 Subject: [PATCH 14/38] SWDEV-1 - Bump patch version to 22060 Change-Id: Ib02ad932a205d10439dbb7efd50f07fe717b3f4a --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 104648a66a..c9690762f9 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "3"; -$HIP_BASE_VERSION_PATCH = "22000"; +$HIP_BASE_VERSION_PATCH = "22060"; #--- # Function to parse config file From 53d16b58f34a0a798cf80b02931d66ac6061ed30 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 8 Sep 2022 14:20:19 +0000 Subject: [PATCH 15/38] SWDEV-1 - Bump patch version to 22360 Change-Id: I5f56324923e530306cacb2c91e7cfddf9979abb0 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index c9690762f9..1f9300b3da 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "3"; -$HIP_BASE_VERSION_PATCH = "22060"; +$HIP_BASE_VERSION_PATCH = "22360"; #--- # Function to parse config file From ddfea75919d10895392a51dc9af43a07683cbb9f Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 20 Sep 2022 08:00:27 +0000 Subject: [PATCH 16/38] SWDEV-1 - Bump patch version to 22520 Change-Id: Icf8c4ffac9161183fdcc3dfc4ddfc8c03be5c3ff --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 2141d5bb13..dac6b4964a 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "4"; -$HIP_BASE_VERSION_PATCH = "22360"; +$HIP_BASE_VERSION_PATCH = "22520"; #--- # Function to parse config file From bb8417599a2a47426ca6ac5b5db55d385a72c9f6 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 3 Oct 2022 16:40:32 +0000 Subject: [PATCH 17/38] SWDEV-1 - Bump patch version to 22740 Change-Id: I7ef87c5916355757c4fd01ff49f8bb521abeaab4 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index dac6b4964a..bdf7b27988 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "4"; -$HIP_BASE_VERSION_PATCH = "22520"; +$HIP_BASE_VERSION_PATCH = "22740"; #--- # Function to parse config file From 18c68c12359c8774bb61dcb84eeb49a1e061a43b Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 14 Oct 2022 09:53:27 +0000 Subject: [PATCH 18/38] SWDEV-1 - Bump patch 
version to 22800 Change-Id: I6f9cf17849a6df6ba41d6f91568b2892400824c2 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index bdf7b27988..fa0360d451 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "4"; -$HIP_BASE_VERSION_PATCH = "22740"; +$HIP_BASE_VERSION_PATCH = "22800"; #--- # Function to parse config file From e3f6abe7f64315fb333a8e034078a65cdf45b78f Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 21 Oct 2022 10:13:12 +0000 Subject: [PATCH 19/38] SWDEV-1 - Bump patch version to 22870 Change-Id: I3ba9083bea97713407ccc2656e2bd66d6419f542 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index fa0360d451..9c945a365b 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "4"; -$HIP_BASE_VERSION_PATCH = "22800"; +$HIP_BASE_VERSION_PATCH = "22870"; #--- # Function to parse config file From 66c3ba6e824bec776574936b53a9e0222554fa1a Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 9 Nov 2022 06:50:57 +0000 Subject: [PATCH 20/38] SWDEV-1 - Bump patch version to 22990 Change-Id: Ia64a235e43962c5570b8779e07cdaea8153a4298 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 9c945a365b..ce7c5febe0 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "4"; -$HIP_BASE_VERSION_PATCH = "22870"; +$HIP_BASE_VERSION_PATCH = "22990"; #--- # Function to parse config file From 9fe0f94f06fe64e6789764bc5a98b91e5dd97d4c Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 21 Nov 2022 08:35:55 +0000 Subject: [PATCH 21/38] SWDEV-1 - Bump patch version to 23180 Change-Id: Id8d70b96711a6a36c3e08e13a903ca73b747b8fc --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 2040f330cb..83da9ed3b6 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "5"; -$HIP_BASE_VERSION_PATCH = "22990"; +$HIP_BASE_VERSION_PATCH = "23180"; #--- # Function to parse config file From 1c310c00d82ab5e0966029999db60308c79c7370 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 1 Dec 2022 08:44:14 +0000 Subject: [PATCH 22/38] SWDEV-1 - Bump patch version to 23210 Change-Id: I4464d950b409455a55b8778470dba88fdc03c881 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 83da9ed3b6..84636d1eab 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "5"; -$HIP_BASE_VERSION_PATCH = "23180"; +$HIP_BASE_VERSION_PATCH = "23210"; #--- # Function to parse config file From b96b4973643ae0af5cf75e08666612c631f94191 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 13 Dec 2022 10:35:57 +0000 Subject: [PATCH 23/38] SWDEV-1 - Bump patch version to 23380 Change-Id: I7341b957c72bc8c3d417ed0ea43135d52e894f56 --- bin/hipvars.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipvars.pm b/bin/hipvars.pm index 787b0cbd27..bc6a267835 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; 
$HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "5"; -$HIP_BASE_VERSION_PATCH = "23210"; +$HIP_BASE_VERSION_PATCH = "23380"; #--- # Function to parse config file From d9459e6f54359fb37f3e1c3511471712523189f4 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 27 Jan 2023 09:09:37 +0000 Subject: [PATCH 24/38] SWDEV-1 - Bump patch version to 30200 Change-Id: I7a47fc57422c1703e17cb89dc222a25492a86e07 --- VERSION | 2 +- bin/hipvars.pm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 0db853aa34..568627cd71 100644 --- a/VERSION +++ b/VERSION @@ -3,4 +3,4 @@ #HIP_VERSION_MINOR 5 #HIP_VERSION_PATCH -0 +30200 diff --git a/bin/hipvars.pm b/bin/hipvars.pm index bc6a267835..af02a930af 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "5"; -$HIP_BASE_VERSION_PATCH = "23380"; +$HIP_BASE_VERSION_PATCH = "30200"; #--- # Function to parse config file From 8b48dbd3c0136b089bb2eef92fdfcafbc02eaead Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 3 Feb 2023 15:56:40 +0000 Subject: [PATCH 25/38] SWDEV-1 - Bump patch version to 30201 Change-Id: Ic1d2003bd60e8c7d0ed8d4bb06b16ff007288301 --- VERSION | 2 +- bin/hipvars.pm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 568627cd71..9848f824d1 100644 --- a/VERSION +++ b/VERSION @@ -3,4 +3,4 @@ #HIP_VERSION_MINOR 5 #HIP_VERSION_PATCH -30200 +30201 diff --git a/bin/hipvars.pm b/bin/hipvars.pm index af02a930af..c0c2eaa3f8 100644 --- a/bin/hipvars.pm +++ b/bin/hipvars.pm @@ -27,7 +27,7 @@ use File::Basename; $HIP_BASE_VERSION_MAJOR = "5"; $HIP_BASE_VERSION_MINOR = "5"; -$HIP_BASE_VERSION_PATCH = "30200"; +$HIP_BASE_VERSION_PATCH = "30201"; #--- # Function to parse config file From 916d61802ed1598e0461a9afb430b8eae1ecf90f Mon Sep 17 00:00:00 2001 From: Julia Jiang Date: Wed, 11 Jan 2023 17:33:33 -0500 Subject: [PATCH 26/38] SWDEV-377267 - Update hip_build document for ROCm 5.5 Change-Id: I9fb1527572487c8fa97c7a7c289ec6081f17305d --- docs/markdown/hip_build.md | 52 ++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/docs/markdown/hip_build.md b/docs/markdown/hip_build.md index 54d85fa540..4bab7ee8e1 100755 --- a/docs/markdown/hip_build.md +++ b/docs/markdown/hip_build.md @@ -38,15 +38,15 @@ Install Nvidia driver and pre-build packages (see HIP Installation Guide at http ## Branch of repository -Before get HIP source code, set the expected branch of repository at the variable HIP_BRANCH. +Before getting the HIP source code, set the expected branch of the repository in the variable ROCM_BRANCH. For example, for ROCm5.0 release branch, set ``` -export HIP_BRANCH=rocm-5.0.x +export ROCM_BRANCH=rocm-5.0.x ``` -ROCm5.1 release branch, set +ROCm5.4 release branch, set ``` -export HIP_BRANCH=rocm-5.1.x +export ROCM_BRANCH=rocm-5.4.x ``` Similar format for future branches. @@ -59,9 +59,10 @@ ROCM_PATH is path where ROCM is installed. By default ROCM_PATH is at /opt/rocm.
## Get HIP source code ``` -git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/hipamd.git -git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/hip.git -git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/ROCclr.git +git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/hipamd.git +git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/hip.git +git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/ROCclr.git +git clone -b "$ROCM_BRANCH" https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git ``` ## Set the environment variables @@ -136,51 +137,54 @@ Please note, the integrated HIP directed tests, will be deprecated in future rel ### Build HIP catch tests -After build and install HIP commands, catch tests can be built via the following instructions, +HIP catch tests, based on the new Catch2 architecture, are officially separated from the HIP project. They live in the HIP tests repository and can be built via the following instructions. + +#### Get HIP tests source code + +``` +git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/hip-tests.git +``` +#### Build HIP tests from source ``` -cd "$HIP_DIR" +export HIP_TESTS_DIR="$(readlink -f hip-tests)" +cd "$HIP_TESTS_DIR" mkdir -p build; cd build -export HIP_PATH=$HIPAMD_DIR/build/install -cmake ../tests/catch/ -DHIP_PLATFORM=amd +export HIP_PATH=$HIPAMD_DIR/build/install (or any path where HIP is installed, for example, /opt/rocm) +cmake ../catch/ -DHIP_PLATFORM=amd make -j$(nproc) build_tests ctest # run tests ``` +HIP catch tests are built under the folder $HIP_TESTS_DIR/build. -HIP catch tests are built under the folder $HIP_DIR/build. -To run a single catch test, the following is an example, +To run any single catch test, the following is an example, ``` -cd $HIP_DIR/build/unit/texture +cd $HIP_TESTS_DIR/build/catch_tests/unit/texture ./TextureTest ``` -### Build HIP Catch2 standalone test +#### Build HIP Catch2 standalone test HIP Catch2 supports building a standalone test, for example, ``` -export PATH=$HIP_DIR/bin:$PATH -export HIP_PATH=$HIPAMD_DIR/build/install - -hipcc $HIP_DIR/tests/catch/unit/memory/hipPointerGetAttributes.cc -I ./tests/catch/include ./tests/catch/hipTestMain/standalone_main.cc -I ./tests/catch/external/Catch2 -g -o hipPointerGetAttributes +cd "$HIP_TESTS_DIR" +hipcc $HIP_TESTS_DIR/catch/unit/memory/hipPointerGetAttributes.cc -I ./catch/include ./catch/hipTestMain/standalone_main.cc -I ./catch/external/Catch2 -o hipPointerGetAttributes ./hipPointerGetAttributes ... All tests passed ``` -HIP catch tests, especially new architectured Catch2, will be official HIP tests in the repository and can be built alone as with the instructions shown above.
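+
+For illustration, a minimal standalone test file (a hypothetical `mySanity.cc`; only the `hip_test_common.hh` header and the hipcc command line above are assumed) can be compiled the same way:
+```
+// mySanity.cc - hypothetical minimal standalone catch test
+#include <hip_test_common.hh>
+
+TEST_CASE("sanity") {
+  int count = 0;
+  // Query the device count; assumes at least one visible GPU
+  REQUIRE(hipGetDeviceCount(&count) == hipSuccess);
+  REQUIRE(count >= 1);
+}
+```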
- # Build HIP on NVIDIA platform ## Get HIP source code ``` -git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/hip.git -git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/hipamd.git +git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/hip.git +git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/hipamd.git ``` ## Set the environment variables From 471ddbf1375c6eb17eb7abf3ef8d7e70e40ba2f9 Mon Sep 17 00:00:00 2001 From: Julia Jiang Date: Fri, 27 Jan 2023 17:16:02 -0500 Subject: [PATCH 27/38] SWDEV-379789 - Correct virtual function support information Change-Id: Ife63b2f4bb3ecdb7fed3b796bc3cd582c1b5ad20 --- docs/markdown/hip_kernel_language.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md index 2b4f7e9cc8..5f8e9f8f95 100644 --- a/docs/markdown/hip_kernel_language.md +++ b/docs/markdown/hip_kernel_language.md @@ -862,8 +862,9 @@ Output Constraints are specified by an `"="` prefix as shown above ("=v"). This ## C++ Support The following C++ features are not supported: - Run-time-type information (RTTI) - Try/catch +- Virtual functions +Virtual functions are not supported if objects containing virtual function tables are passed between GPUs of different offload architectures, e.g. between gfx906 and gfx1030. Otherwise, virtual functions are supported. ## Kernel Compilation hipcc now supports compiling C++/HIP kernels to binary code objects. From abfaf1dcb2df8e3df98d21ecdf3e6d7099e66d30 Mon Sep 17 00:00:00 2001 From: Julia Jiang Date: Mon, 9 Jan 2023 15:10:34 -0500 Subject: [PATCH 28/38] SWDEV-376665 - Correct information for hipStreamDestroy API Change-Id: If1c255880569dbd81a058683185065502a10b782 --- include/hip/hip_runtime_api.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index e8f82ed937..8519a4a367 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -2116,8 +2116,7 @@ hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPrio /** * @brief Destroys the specified stream. * - * @param[in, out] stream Valid pointer to hipStream_t. This function writes the memory with the - * newly created stream. + * @param[in] stream stream identifier. * @return #hipSuccess #hipErrorInvalidHandle * * Destroys the specified stream. From 2525892ec6d8a450f37bbf4cf6d906bec1fcb149 Mon Sep 17 00:00:00 2001 From: Julia Jiang Date: Thu, 15 Dec 2022 13:19:54 -0500 Subject: [PATCH 29/38] SWDEV-368819 - Add information on HIP APIs Change-Id: I0677280d14c3b813b4288682360824a20995eaf1 --- include/hip/hip_runtime_api.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index 8519a4a367..a471b9ca10 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -2580,12 +2580,16 @@ hipError_t hipEventQuery(hipEvent_t event); hipError_t hipPointerSetAttribute(const void* value, hipPointer_attribute attribute, hipDeviceptr_t ptr); + /** * @brief Return attributes for the specified pointer * * @param [out] attributes attributes for the specified pointer * @param [in] ptr pointer to get attributes for * + * Note: To get a pointer's memory type, the parameter attributes has a 'type' member variable. + * The 'type' indicates whether the input pointer is allocated on the device or the host.
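+ *
+ * For example, a minimal sketch (assuming ptr came from hipMalloc or hipHostMalloc):
+ * @code
+ * hipPointerAttribute_t attributes;
+ * if (hipPointerGetAttributes(&attributes, ptr) == hipSuccess &&
+ *     attributes.type == hipMemoryTypeDevice) {
+ *   // ptr refers to device memory
+ * }
+ * @endcode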
+ *
 * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
 *
 * @see hipPointerGetAttribute
@@ -2772,10 +2776,11 @@ hipError_t hipMemAllocHost(void** ptr, size_t size);
 * @brief Allocate device accessible page locked host memory
 *
 * @param[out] ptr Pointer to the allocated host pinned memory
- * @param[in] size Requested memory size
+ * @param[in] size Requested memory size in bytes
 * @param[in] flags Type of host memory allocation
 *
 * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ * If flags is not specified, the default pinned memory allocation on the host is used.
 *
 * @return #hipSuccess, #hipErrorOutOfMemory
 *
@@ -3290,7 +3295,7 @@ hipError_t hipMemPoolImportPointer(
 * @brief Allocate device accessible page locked host memory [Deprecated]
 *
 * @param[out] ptr Pointer to the allocated host pinned memory
- * @param[in] size Requested memory size
+ * @param[in] size Requested memory size in bytes
 * @param[in] flags Type of host memory allocation
 *
 * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.

From b928ee053b2a1bb40865c1c0229930264bb53a23 Mon Sep 17 00:00:00 2001
From: Julia Jiang
Date: Tue, 7 Feb 2023 21:02:58 -0500
Subject: [PATCH 30/38] SWDEV-336460 - Update description for scratch allocation API

Change-Id: I95479cbebdbf41145b039b31caf4c6ddadb94bf5
---
 include/hip/hip_runtime_api.h | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index a471b9ca10..63bacebb7b 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -1695,21 +1695,25 @@ hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig);
 */
 hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* cacheConfig);
 /**
- * @brief Get Resource limits of current device
+ * @brief Gets resource limits of current device
+ * The function queries the size of the limit value, given the required input enum hipLimit_t,
+ * which can be either hipLimitStackSize or hipLimitMallocHeapSize.
 *
- * @param [out] pValue
- * @param [in] limit
+ * @param [out] pValue Returns the size of the limit in bytes
+ * @param [in] limit The limit to query
 *
 * @returns #hipSuccess, #hipErrorUnsupportedLimit, #hipErrorInvalidValue
- * Note: Currently, only hipLimitMallocHeapSize is available
 *
 */
 hipError_t hipDeviceGetLimit(size_t* pValue, enum hipLimit_t limit);
 /**
- * @brief Set Resource limits of current device
- *
- * @param [in] limit
- * @param [in] value
+ * @brief Sets resource limits of current device
+ * For the input enum limit, hipLimitStackSize sets the limit value of the stack size on the
+ * current GPU device, while hipLimitMallocHeapSize sets the limit value of the heap used by
+ * the malloc()/free() calls.
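+ *
+ * A brief usage sketch for illustration (the 64 MB heap size below is an
+ * arbitrary example value):
+ * @code
+ * size_t heapSize = 0;
+ * hipDeviceSetLimit(hipLimitMallocHeapSize, 64 * 1024 * 1024);
+ * hipDeviceGetLimit(&heapSize, hipLimitMallocHeapSize);
+ * @endcode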
+ *
+ * @param [in] limit Enum of hipLimit_t to set
+ * @param [in] value The size of the limit value in bytes
 *
 * @returns #hipSuccess, #hipErrorUnsupportedLimit, #hipErrorInvalidValue
 *

From 38d2451ee970ed3b398bf39a5286fc21318253f8 Mon Sep 17 00:00:00 2001
From: Julia Jiang
Date: Thu, 19 Jan 2023 21:11:38 -0500
Subject: [PATCH 31/38] SWDEV-306306 - Update information for hipMemGetInfo API

Change-Id: Ib2b7f81189a82fd47f28cd0056f2997d4ead0e34
---
 include/hip/hip_runtime_api.h | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index 63bacebb7b..51641c4958 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -3818,7 +3818,7 @@ hipError_t hipMemsetD32(hipDeviceptr_t dest, int value, size_t count);
 * @brief Fills the first sizeBytes bytes of the memory area pointed to by dev with the constant
 * byte value value.
 *
- * hipMemsetAsync() is asynchronous with respect to the host, so the call may return before the
+ * hipMemsetAsync() is asynchronous with respect to the host, so the call may return before the
 * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
 * stream argument. If stream is non-zero, the operation may overlap with operations in other
 * streams.
@@ -3891,14 +3891,22 @@ hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent
 hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent ,hipStream_t stream __dparm(0));
 /**
 * @brief Query memory info.
- * Return snapshot of free memory, and total allocatable memory on the device.
 *
- * Returns in *free a snapshot of the current free memory.
- * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
- * @warning On HCC, the free memory only accounts for memory allocated by this process and may be
-*optimistic.
+ * On ROCm, this function gets the actual free memory left on the current device, so it supports
+ * the cases of running multiple workloads (such as multiple processes, multiple threads, and
+ * multiple GPUs).
+ *
+ * @warning On Windows, the free memory only accounts for memory allocated by this process and may
+ * be optimistic.
+ *
+ * @param[out] free Returns free memory on the current device in bytes
+ * @param[out] total Returns total allocatable memory on the current device in bytes
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
 **/
 hipError_t hipMemGetInfo(size_t* free, size_t* total);
+
 hipError_t hipMemPtrGetInfo(void* ptr, size_t* size);

 /**
 * @brief Allocate an array on the device.
From 67ce04904091e3fbc043a889cdd0bda90a464b5e Mon Sep 17 00:00:00 2001
From: AravindanC
Date: Thu, 5 Jan 2023 10:21:38 -0800
Subject: [PATCH 32/38] SWDEV-356879 - find_dependency instead of hsa path search

Change-Id: I728c11146eb355b72ec0e01b20832787b93afb5b
---
 hip-lang-config.cmake.in | 21 ++------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/hip-lang-config.cmake.in b/hip-lang-config.cmake.in
index 46050ca1bd..cee1b0e76b 100644
--- a/hip-lang-config.cmake.in
+++ b/hip-lang-config.cmake.in
@@ -72,23 +72,6 @@ get_filename_component(_DIR "${CMAKE_CURRENT_LIST_FILE}" REALPATH)
 get_filename_component(_IMPORT_PREFIX "${_DIR}/../../../../" ABSOLUTE)

-#If HIP isnot installed under ROCm, need this to find HSA assuming HSA is under ROCm
-if( DEFINED ENV{ROCM_PATH} )
-  set(ROCM_PATH "$ENV{ROCM_PATH}")
-endif()
-
-#if HSA is not under ROCm then provide CMAKE_PREFIX_PATH=
-find_path(HSA_HEADER hsa/hsa.h
-  PATHS
-    "${_IMPORT_PREFIX}/include"
-    "${ROCM_PATH}/include"
-)
-
-if (NOT HSA_HEADER)
-  message (FATAL_ERROR "HSA header not found! ROCM_PATH environment not set")
-endif()
-
-
 set_target_properties(hip-lang::device PROPERTIES
   INTERFACE_INCLUDE_DIRECTORIES "$<$<COMPILE_LANGUAGE:HIP>:${_IMPORT_PREFIX}/include>"
   INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "$<$<COMPILE_LANGUAGE:HIP>:${_IMPORT_PREFIX}/include>"
@@ -96,8 +79,8 @@ set_target_properties(hip-lang::device PROPERTIES
 set_target_properties(hip-lang::amdhip64 PROPERTIES
   INTERFACE_COMPILE_DEFINITIONS "$<$<COMPILE_LANGUAGE:HIP>:__HIP_ROCclr__=1>"
-  INTERFACE_INCLUDE_DIRECTORIES "$<$<COMPILE_LANGUAGE:HIP>:${_IMPORT_PREFIX}/include;${HSA_HEADER}>"
-  INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "$<$<COMPILE_LANGUAGE:HIP>:${_IMPORT_PREFIX}/include;${HSA_HEADER}>"
+  INTERFACE_INCLUDE_DIRECTORIES "$<$<COMPILE_LANGUAGE:HIP>:${_IMPORT_PREFIX}/include>"
+  INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "$<$<COMPILE_LANGUAGE:HIP>:${_IMPORT_PREFIX}/include>"
 )
 set_target_properties(hip-lang::device PROPERTIES
   INTERFACE_COMPILE_DEFINITIONS "$<$<COMPILE_LANGUAGE:HIP>:__HIP_ROCclr__=1>"

From c96d5a0b75d0d3cd5c9bd5add803bfd40e3b2806 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan
Date: Tue, 24 Jan 2023 06:42:52 +0000
Subject: [PATCH 33/38] SWDEV-355608 - deprecate/cleanup hipcc link flags (#3128)

- deprecate -use-staticlib, -use-sharedlib which no longer provide any functional values
- use --hip-link instead of specifying the HIP runtime by name when linking
- fix linker option bug in HIT test's cmake
- update build options for unit tests requiring pthread or rt

Change-Id: Ib49978773c80fb40c71dc52b050ce921943ee3e4
---
 bin/hipcc.pl | 43 ++++++-------------
 samples/0_Intro/square/README.md | 2 +-
 tests/catch/multiproc/CMakeLists.txt | 6 ++-
 tests/catch/unit/device/CMakeLists.txt | 7 ++-
 tests/catch/unit/deviceLib/CMakeLists.txt | 3 +-
 tests/catch/unit/errorHandling/CMakeLists.txt | 7 ++-
 tests/catch/unit/event/CMakeLists.txt | 4 ++
 tests/catch/unit/graph/CMakeLists.txt | 7 ++-
 tests/catch/unit/memory/CMakeLists.txt | 5 ++-
 tests/catch/unit/multiThread/CMakeLists.txt | 7 ++-
 tests/catch/unit/stream/CMakeLists.txt | 7 ++-
 .../catch/unit/streamperthread/CMakeLists.txt | 7 ++-
 tests/hit/HIT.cmake | 2 +-
 .../src/deviceLib/hip_threadfence_system.cpp | 2 +-
 tests/src/hipEnvVarDriver.cpp | 2 +-
 tests/src/ipc/hipMultiProcIpcEvent.cpp | 2 +-
 tests/src/ipc/hipMultiProcIpcMem.cpp | 2 +-
 .../event/hipEventMultiThreaded.cpp | 2 +-
 .../runtimeApi/memory/hipIpcMemAccessTest.cpp | 2 +-
 .../memory/hipMallocConcurrency.cpp | 2 +-
 tests/src/runtimeApi/memory/hipMemcpy.cpp | 2 +-
 .../memory/hipMemcpyNegativeMThrdMSize.cpp | 2 +-
 .../memory/hipMemcpyWithStreamMultiThread.cpp | 2 +-
 .../hipMemset2DAsyncMultiThreadAndKernel.cpp | 2 +-
.../memory/hipMemset3DRegressMultiThread.cpp | 2 +- .../memory/hipMemsetAsyncMultiThread.cpp | 2 +- .../hipMultiMemcpyMultiThrdMultiStrm.cpp | 2 +- .../memory/hipMultiMemcpyMultiThread.cpp | 2 +- .../memory/hipPointerAttributes.cpp | 2 +- .../hipModuleLoadDataMultThreadOnMultGPU.cpp | 2 +- .../module/hipModuleLoadDataMultThreaded.cpp | 2 +- .../module/hipModuleLoadMultiThreaded.cpp | 2 +- .../module/hipModuleTexture2dDrv.cpp | 2 +- .../multiThread/hipMultiThreadDevice.cpp | 2 +- .../multiThread/hipMultiThreadStreams1.cpp | 2 +- .../multiThread/hipMultiThreadStreams2.cpp | 2 +- .../p2p/hipP2pLinkTypeAndHopFunc.cpp | 2 +- .../stream/hipStreamACb_MStrm_Mgpu.cpp | 2 +- .../stream/hipStreamACb_MultiThread.cpp | 2 +- .../stream/hipStreamAddCallbackCatch.cpp | 2 +- .../streamOperations/hipstream_operations.cpp | 2 +- .../cache_coherency_cpu_gpu.cpp | 2 +- 42 files changed, 93 insertions(+), 72 deletions(-) diff --git a/bin/hipcc.pl b/bin/hipcc.pl index 431701fb50..6898cd0bba 100644 --- a/bin/hipcc.pl +++ b/bin/hipcc.pl @@ -199,10 +199,6 @@ BEGIN print ("HIP_CLANG_RT_LIB=$HIP_CLANG_RT_LIB\n"); } - $HIPLDFLAGS .= " -L\"$HIP_LIB_PATH\""; - if ($isWindows) { - $HIPLDFLAGS .= " -lamdhip64"; - } if ($HIP_CLANG_HCC_COMPAT_MODE) { ## Allow __fp16 as function parameter and return type. $HIPCXXFLAGS .= " -Xclang -fallow-half-arguments-and-returns -D__HIP_HCC_COMPAT_MODE__=1"; @@ -245,8 +241,6 @@ BEGIN my $printLDFlags = 0; # print HIPLDFLAGS my $runCmd = 1; my $buildDeps = 0; -my $linkType = 1; -my $setLinkType = 0; my $hsacoVersion = 0; my $funcSupp = 0; # enable function support my $rdc = 0; # whether -fgpu-rdc is on @@ -361,16 +355,11 @@ BEGIN $compileOnly = 1; $buildDeps = 1; } - if(($trimarg eq '-use-staticlib') and ($setLinkType eq 0)) - { - $linkType = 0; - $setLinkType = 1; - $swallowArg = 1; + if($trimarg eq '-use-staticlib') { + print "Warning: The -use-staticlib option has been deprecated and is no longer needed.\n" } - if(($trimarg eq '-use-sharedlib') and ($setLinkType eq 0)) - { - $linkType = 1; - $setLinkType = 1; + if($trimarg eq '-use-sharedlib') { + print "Warning: The -use-sharedlib option has been deprecated and is no longer needed.\n" } if($arg =~ m/^-O/) { @@ -558,12 +547,6 @@ BEGIN $HIPCXXFLAGS .= " --cuda-host-only"; } -# Add --hip-link only if it is compile only and -fgpu-rdc is on. -if ($rdc and !$compileOnly and $HIP_PLATFORM eq 'amd') { - $HIPLDFLAGS .= " --hip-link"; - $HIPLDFLAGS .= $HIPLDARCHFLAGS; -} - # hipcc currrently requires separate compilation of source files, ie it is not possible to pass # CPP files combined with .O files # Reason is that NVCC uses the file extension to determine whether to compile in CUDA mode or @@ -588,18 +571,16 @@ BEGIN $HIPCXXFLAGS .= " --hip-device-lib-path=\"$DEVICE_LIB_PATH\""; } } - if (not $isWindows) { - $HIPLDFLAGS .= " -lgcc_s -lgcc -lpthread -lm -lrt"; - } - if (not $isWindows and not $compileOnly) { - if ($linkType eq 0) { - $toolArgs = " -L$HIP_LIB_PATH -lamdhip64 -L$ROCM_PATH/lib -lhsa-runtime64 -ldl -lnuma " . ${toolArgs}; - } else { - $toolArgs = ${toolArgs} . 
" -Wl,-rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lamdhip64 "; - } + if (!$compileOnly) { + $HIPLDFLAGS .= " --hip-link"; + if ($rdc) { + $HIPLDFLAGS .= $HIPLDARCHFLAGS; + } + if (not $isWindows) { + $HIPLDFLAGS .= " --rtlib=compiler-rt -unwindlib=libgcc"; - $toolArgs .= " -L$HIP_CLANG_RT_LIB -lclang_rt.builtins-x86_64 " + } } } diff --git a/samples/0_Intro/square/README.md b/samples/0_Intro/square/README.md index 807f08754e..e06fbceda1 100644 --- a/samples/0_Intro/square/README.md +++ b/samples/0_Intro/square/README.md @@ -21,7 +21,7 @@ $ cd ~/hip/samples/0_Intro/square $ make /opt/rocm/hip/bin/hipify-perl square.cu > square.cpp /opt/rocm/hip/bin/hipcc square.cpp -o square.out -/opt/rocm/hip/bin/hipcc -use-staticlib square.cpp -o square.out.static +/opt/rocm/hip/bin/hipcc square.cpp -o square.out ``` - Execute file ``` diff --git a/tests/catch/multiproc/CMakeLists.txt b/tests/catch/multiproc/CMakeLists.txt index 5485ee9ca5..58bec6f70f 100644 --- a/tests/catch/multiproc/CMakeLists.txt +++ b/tests/catch/multiproc/CMakeLists.txt @@ -19,6 +19,9 @@ set(LINUX_TEST_SRC ) add_custom_target(dummy_kernel.code COMMAND ${CMAKE_CXX_COMPILER} --genco ${CMAKE_CURRENT_SOURCE_DIR}/dummy_kernel.cpp -o ${CMAKE_CURRENT_BINARY_DIR}/../multiproc/dummy_kernel.code -I${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/ -I${CMAKE_CURRENT_SOURCE_DIR}/../../include) +if (UNIX) + set(__LINKER_LIBS__ pthread) +endif() # the last argument linker libraries is required for this test but optional to the function if(HIP_PLATFORM MATCHES "nvidia") @@ -30,7 +33,6 @@ elseif(HIP_PLATFORM MATCHES "amd") hip_add_exe_to_target(NAME MultiProc TEST_SRC ${LINUX_TEST_SRC} TEST_TARGET_NAME build_tests - LINKER_LIBS ${CMAKE_DL_LIBS}) + LINKER_LIBS ${CMAKE_DL_LIBS} ${__LINKER_LIBS__}) endif() add_dependencies(build_tests dummy_kernel.code) - diff --git a/tests/catch/unit/device/CMakeLists.txt b/tests/catch/unit/device/CMakeLists.txt index 1e26944d0f..b9fe3726f2 100644 --- a/tests/catch/unit/device/CMakeLists.txt +++ b/tests/catch/unit/device/CMakeLists.txt @@ -43,10 +43,15 @@ set_source_files_properties(hipDeviceGetP2PAttribute.cc PROPERTIES COMPILE_FLAGS add_executable(getDeviceCount EXCLUDE_FROM_ALL getDeviceCount_exe.cc) add_executable(hipDeviceGetP2PAttribute EXCLUDE_FROM_ALL hipDeviceGetP2PAttribute_exe.cc) +if (UNIX) + set(__LINKER_LIBS__ pthread) +endif() + hip_add_exe_to_target(NAME DeviceTest TEST_SRC ${TEST_SRC} TEST_TARGET_NAME build_tests - COMPILE_OPTIONS -std=c++14) + COMPILE_OPTIONS -std=c++14 + LINKER_LIBS ${__LINKER_LIBS__}) add_dependencies(DeviceTest getDeviceCount) add_dependencies(DeviceTest hipDeviceGetP2PAttribute) diff --git a/tests/catch/unit/deviceLib/CMakeLists.txt b/tests/catch/unit/deviceLib/CMakeLists.txt index c54cd68c08..3d882a6644 100644 --- a/tests/catch/unit/deviceLib/CMakeLists.txt +++ b/tests/catch/unit/deviceLib/CMakeLists.txt @@ -18,6 +18,7 @@ set(TEST_SRC if(UNIX) set(TEST_SRC ${TEST_SRC} deviceAllocation.cc) + set(__LINKER_LIBS__ pthread) endif() # AMD only tests @@ -86,7 +87,7 @@ endif() hip_add_exe_to_target(NAME UnitDeviceTests TEST_SRC ${TEST_SRC} TEST_TARGET_NAME build_tests - LINKER_LIBS hiprtc) + LINKER_LIBS hiprtc ${__LINKER_LIBS__}) elseif(HIP_PLATFORM MATCHES "nvidia") hip_add_exe_to_target(NAME UnitDeviceTests TEST_SRC ${TEST_SRC} diff --git a/tests/catch/unit/errorHandling/CMakeLists.txt b/tests/catch/unit/errorHandling/CMakeLists.txt index 78933fd3f8..1b89eb066e 100644 --- a/tests/catch/unit/errorHandling/CMakeLists.txt +++ b/tests/catch/unit/errorHandling/CMakeLists.txt @@ -6,7 +6,12 @@ 
set(TEST_SRC hipPeekAtLastError.cc ) +if (UNIX) + set(__LINKER_LIBS__ pthread) +endif() + hip_add_exe_to_target(NAME ErrorHandlingTest TEST_SRC ${TEST_SRC} TEST_TARGET_NAME build_tests - COMPILE_OPTIONS -std=c++14) \ No newline at end of file + LINKER_LIBS ${__LINKER_LIBS__} + COMPILE_OPTIONS -std=c++14) diff --git a/tests/catch/unit/event/CMakeLists.txt b/tests/catch/unit/event/CMakeLists.txt index b4d5f2c3b6..42b8699ba3 100644 --- a/tests/catch/unit/event/CMakeLists.txt +++ b/tests/catch/unit/event/CMakeLists.txt @@ -20,6 +20,10 @@ if(HIP_PLATFORM MATCHES "amd") set(TEST_SRC ${TEST_SRC} ${AMD_SRC}) endif() +if (UNIX) + set(__LINKER_LIBS__ pthread) +endif() + hip_add_exe_to_target(NAME EventTest TEST_SRC ${TEST_SRC} TEST_TARGET_NAME build_tests) diff --git a/tests/catch/unit/graph/CMakeLists.txt b/tests/catch/unit/graph/CMakeLists.txt index e2b66872da..a28810a7e8 100644 --- a/tests/catch/unit/graph/CMakeLists.txt +++ b/tests/catch/unit/graph/CMakeLists.txt @@ -83,6 +83,11 @@ set(TEST_SRC hipUserObjectCreate.cc ) +if (UNIX) + set(__LINKER_LIBS__ pthread) +endif() + hip_add_exe_to_target(NAME GraphsTest TEST_SRC ${TEST_SRC} - TEST_TARGET_NAME build_tests) + TEST_TARGET_NAME build_tests + LINKER_LIBS ${__LINKER_LIBS__}) diff --git a/tests/catch/unit/memory/CMakeLists.txt b/tests/catch/unit/memory/CMakeLists.txt index 1b4cef3698..74165c5db0 100644 --- a/tests/catch/unit/memory/CMakeLists.txt +++ b/tests/catch/unit/memory/CMakeLists.txt @@ -182,8 +182,11 @@ if(UNIX) set(TEST_SRC ${TEST_SRC} hipHmmOvrSubscriptionTst.cc hipMemoryAllocateCoherent.cc) + + set(__LINKER_LIBS__ pthread) endif() hip_add_exe_to_target(NAME MemoryTest TEST_SRC ${TEST_SRC} - TEST_TARGET_NAME build_tests) + TEST_TARGET_NAME build_tests + LINKER_LIBS ${__LINKER_LIBS__}) diff --git a/tests/catch/unit/multiThread/CMakeLists.txt b/tests/catch/unit/multiThread/CMakeLists.txt index 32abf0f5f3..515832ef95 100644 --- a/tests/catch/unit/multiThread/CMakeLists.txt +++ b/tests/catch/unit/multiThread/CMakeLists.txt @@ -5,6 +5,11 @@ set(TEST_SRC hipMultiThreadStreams2.cc ) +if (UNIX) + set(__LINKER_LIBS__ pthread) +endif() + hip_add_exe_to_target(NAME MultiThreadTest TEST_SRC ${TEST_SRC} - TEST_TARGET_NAME build_tests) + TEST_TARGET_NAME build_tests + LINKER_LIBS ${__LINKER_LIBS__}) diff --git a/tests/catch/unit/stream/CMakeLists.txt b/tests/catch/unit/stream/CMakeLists.txt index 548bb2e52c..0a8d0b1a3b 100644 --- a/tests/catch/unit/stream/CMakeLists.txt +++ b/tests/catch/unit/stream/CMakeLists.txt @@ -45,7 +45,12 @@ set(TEST_SRC # set_source_files_properties(hipStreamAttachMemAsync.cc PROPERTIES COMPILE_FLAGS -std=c++17) endif() +if (UNIX) + set(__LINKER_LIBS__ pthread) +endif() + hip_add_exe_to_target(NAME StreamTest TEST_SRC ${TEST_SRC} TEST_TARGET_NAME build_tests - COMPILE_OPTIONS -std=c++17) + COMPILE_OPTIONS -std=c++17 + LINKER_LIBS ${__LINKER_LIBS__}) diff --git a/tests/catch/unit/streamperthread/CMakeLists.txt b/tests/catch/unit/streamperthread/CMakeLists.txt index dd5b54dabc..51e2255fa3 100644 --- a/tests/catch/unit/streamperthread/CMakeLists.txt +++ b/tests/catch/unit/streamperthread/CMakeLists.txt @@ -7,6 +7,11 @@ set(TEST_SRC hipStreamPerThrdTsts.cc ) +if (UNIX) + set(__LINKER_LIBS__ pthread) +endif() + hip_add_exe_to_target(NAME StreamPerThreadTest TEST_SRC ${TEST_SRC} - TEST_TARGET_NAME build_tests) + TEST_TARGET_NAME build_tests + LINKER_LIBS ${__LINKER_LIBS__}) diff --git a/tests/hit/HIT.cmake b/tests/hit/HIT.cmake index 68c3df3c4f..e1c8fcbc4b 100755 --- a/tests/hit/HIT.cmake +++ b/tests/hit/HIT.cmake @@ -354,7 +354,7 
@@ macro(HIT_ADD_FILES _config _dir _label _parent) set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) hip_reset_flags() hip_add_executable(${target} ${_sources} HIPCC_OPTIONS ${_hipcc_options} CLANG_OPTIONS ${_clang_options} NVCC_OPTIONS ${_nvcc_options} EXCLUDE_FROM_ALL) - target_link_libraries(${target} PRIVATE ${_link_options}) + target_link_options(${target} PRIVATE ${_link_options}) set_target_properties(${target} PROPERTIES OUTPUT_NAME ${_target} RUNTIME_OUTPUT_DIRECTORY ${_label} LINK_DEPENDS "${HIP_LIB_FILES}") add_dependencies(${_parent} ${target}) foreach(_dependency ${_depends}) diff --git a/tests/src/deviceLib/hip_threadfence_system.cpp b/tests/src/deviceLib/hip_threadfence_system.cpp index 097dae2273..11e84a3b87 100644 --- a/tests/src/deviceLib/hip_threadfence_system.cpp +++ b/tests/src/deviceLib/hip_threadfence_system.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/hipEnvVarDriver.cpp b/tests/src/hipEnvVarDriver.cpp index 2d303e08fe..bd5de426d6 100644 --- a/tests/src/hipEnvVarDriver.cpp +++ b/tests/src/hipEnvVarDriver.cpp @@ -16,7 +16,7 @@ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s test_common.cpp LINK_OPTIONS -lpthread NVCC_OPTIONS -std=c++11 * TEST: %t * HIT_END */ diff --git a/tests/src/ipc/hipMultiProcIpcEvent.cpp b/tests/src/ipc/hipMultiProcIpcEvent.cpp index 418b68a5ed..b06c010a01 100644 --- a/tests/src/ipc/hipMultiProcIpcEvent.cpp +++ b/tests/src/ipc/hipMultiProcIpcEvent.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../test_common.cpp LINK_OPTIONS -lrt * TEST: %t * HIT_END */ diff --git a/tests/src/ipc/hipMultiProcIpcMem.cpp b/tests/src/ipc/hipMultiProcIpcMem.cpp index 3829853774..ec579dfd7b 100644 --- a/tests/src/ipc/hipMultiProcIpcMem.cpp +++ b/tests/src/ipc/hipMultiProcIpcMem.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../test_common.cpp LINK_OPTIONS -lrt * TEST: %t --N 4 * HIT_END */ diff --git a/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp b/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp index 65f2ff1851..2bf5a78969 100644 --- a/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp +++ b/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp b/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp index cd9809ded5..94142a76bd 100644 --- a/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp +++ b/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. 
*/ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp LINK_OPTIONS -lrt -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp b/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp index 4bfd65a7f9..44a4c5d086 100644 --- a/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp +++ b/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp @@ -58,7 +58,7 @@ Testcase Scenarios : */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 LINK_OPTIONS -lpthread * TEST_NAMED: %t hipMalloc_ArgValidation --tests 1 * TEST_NAMED: %t hipMalloc_LoopRegression_AllocFreeCycle --tests 2 * TEST_NAMED: %t hipMalloc_LoopRegression_AllocPool --tests 3 diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp index c6bb98be0c..6735616b0b 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST_NAMED: %t hipMemcpy-modes --tests 0x1 * TEST_NAMED: %t hipMemcpy-size --tests 0x6 * TEST_NAMED: %t hipMemcpy-dev-offsets --tests 0x10 diff --git a/tests/src/runtimeApi/memory/hipMemcpyNegativeMThrdMSize.cpp b/tests/src/runtimeApi/memory/hipMemcpyNegativeMThrdMSize.cpp index 3f103915ee..53dae66758 100644 --- a/tests/src/runtimeApi/memory/hipMemcpyNegativeMThrdMSize.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyNegativeMThrdMSize.cpp @@ -24,7 +24,7 @@ THE SOFTWARE. // of 8 hipmemcpy apis /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 LINK_OPTIONS -lpthread * TEST_NAMED: %t hipMemcpyNegativeMThrdMSize_Negative_tests --tests 1 * TEST_NAMED: %t hipMemcpyNegativeMThrdMSize_MultiThread_tests --tests 2 * TEST_NAMED: %t hipMemcpyNegativeMThrdMSize_MultiSize_singleType --tests 3 --memcpyPeersOnly 0 --testAllTypes 0 diff --git a/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp b/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp index d5580a4f6d..ddd9c853ea 100644 --- a/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp @@ -24,7 +24,7 @@ THE SOFTWARE. 
*/ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/memory/hipMemset2DAsyncMultiThreadAndKernel.cpp b/tests/src/runtimeApi/memory/hipMemset2DAsyncMultiThreadAndKernel.cpp index aed1efbba8..09057677ac 100644 --- a/tests/src/runtimeApi/memory/hipMemset2DAsyncMultiThreadAndKernel.cpp +++ b/tests/src/runtimeApi/memory/hipMemset2DAsyncMultiThreadAndKernel.cpp @@ -24,7 +24,7 @@ // /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/memory/hipMemset3DRegressMultiThread.cpp b/tests/src/runtimeApi/memory/hipMemset3DRegressMultiThread.cpp index 80d92e9355..7ae81ad695 100644 --- a/tests/src/runtimeApi/memory/hipMemset3DRegressMultiThread.cpp +++ b/tests/src/runtimeApi/memory/hipMemset3DRegressMultiThread.cpp @@ -34,7 +34,7 @@ Testcase Scenarios : */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 LINK_OPTIONS -lpthread * TEST: %t --tests 1 * HIT_END */ diff --git a/tests/src/runtimeApi/memory/hipMemsetAsyncMultiThread.cpp b/tests/src/runtimeApi/memory/hipMemsetAsyncMultiThread.cpp index a7893d7bd7..7f6d379dc2 100644 --- a/tests/src/runtimeApi/memory/hipMemsetAsyncMultiThread.cpp +++ b/tests/src/runtimeApi/memory/hipMemsetAsyncMultiThread.cpp @@ -22,7 +22,7 @@ */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThrdMultiStrm.cpp b/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThrdMultiStrm.cpp index 7388f44b9f..a1eebf20b1 100644 --- a/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThrdMultiStrm.cpp +++ b/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThrdMultiStrm.cpp @@ -21,7 +21,7 @@ // and also launch hipMemcpyAsync() api on the same stream. This test case is simulate the scenario // reported in SWDEV-181598. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThread.cpp b/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThread.cpp index 2a690b9832..ea8f32c965 100644 --- a/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThread.cpp +++ b/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThread.cpp @@ -21,7 +21,7 @@ // and also launch hipMemcpyAsync() api. This test case is simulate the scenario // reported in SWDEV-181598. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/memory/hipPointerAttributes.cpp b/tests/src/runtimeApi/memory/hipPointerAttributes.cpp index 9666dd4693..a85dc156da 100644 --- a/tests/src/runtimeApi/memory/hipPointerAttributes.cpp +++ b/tests/src/runtimeApi/memory/hipPointerAttributes.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. 
*/ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp index f8b436068b..dadcc2d222 100644 --- a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp +++ b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 537195c5fb..4b03c3b189 100644 --- a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvidia LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/module/hipModuleLoadMultiThreaded.cpp b/tests/src/runtimeApi/module/hipModuleLoadMultiThreaded.cpp index dbd41900ce..24f0f782cf 100644 --- a/tests/src/runtimeApi/module/hipModuleLoadMultiThreaded.cpp +++ b/tests/src/runtimeApi/module/hipModuleLoadMultiThreaded.cpp @@ -19,7 +19,7 @@ THE SOFTWARE. /* HIT_START * BUILD_CMD: empty_kernel.code %hc --genco %S/empty_kernel.cpp -o empty_kernel.code - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/module/hipModuleTexture2dDrv.cpp b/tests/src/runtimeApi/module/hipModuleTexture2dDrv.cpp index 67c62f412e..94e753a585 100644 --- a/tests/src/runtimeApi/module/hipModuleTexture2dDrv.cpp +++ b/tests/src/runtimeApi/module/hipModuleTexture2dDrv.cpp @@ -23,7 +23,7 @@ THE SOFTWARE. 
/* Tests 6 and 7 are skipped for CUDA 11.2 due to cuda runtime issues */ /* HIT_START * BUILD_CMD: tex2d_kernel.code %hc --genco %S/tex2d_kernel.cpp -o tex2d_kernel.code - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST: %t --tests 0x01 * TEST: %t --tests 0x02 * TEST: %t --tests 0x03 diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp index af48d5a367..18bf9c0bd2 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp @@ -1,5 +1,5 @@ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST_NAMED: %t hipMultiThreadDevice-serial --tests 0x1 * TEST_NAMED: %t hipMultiThreadDevice-pyramid --tests 0x4 * TEST_NAMED: %t hipMultiThreadDevice-nearzero --tests 0x10 diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index f1c683927b..b8fca4d02f 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp index 2f2367775f..752ffd870d 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/p2p/hipP2pLinkTypeAndHopFunc.cpp b/tests/src/runtimeApi/p2p/hipP2pLinkTypeAndHopFunc.cpp index 3cd5d915b5..6786124095 100644 --- a/tests/src/runtimeApi/p2p/hipP2pLinkTypeAndHopFunc.cpp +++ b/tests/src/runtimeApi/p2p/hipP2pLinkTypeAndHopFunc.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD_CMD: %t %hc %S/%s -o %T/%t %S/../../test_common.cpp -I %S/../../ -L%rocm-path/rocm_smi/lib -lrocm_smi64 -ldl EXCLUDE_HIP_PLATFORM nvidia + * BUILD_CMD: %t %hc %S/%s -o %T/%t %S/../../test_common.cpp -I %S/../../ -L%rocm-path/rocm_smi/lib -lrocm_smi64 -lpthread -ldl EXCLUDE_HIP_PLATFORM nvidia * TEST: %t --tests 0x1 * TEST: %t --tests 0x2 * TEST: %t --tests 0x3 diff --git a/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp b/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp index 35f0e15b11..d56ce2b3c0 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp @@ -21,7 +21,7 @@ // kernel. Verify that all the kernels queued are executed before the callback. 
/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp b/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp index d26e41019c..704621761e 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. // This test case is disabled currently. /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamAddCallbackCatch.cpp b/tests/src/runtimeApi/stream/hipStreamAddCallbackCatch.cpp index c694bf6d2a..59d05204ee 100644 --- a/tests/src/runtimeApi/stream/hipStreamAddCallbackCatch.cpp +++ b/tests/src/runtimeApi/stream/hipStreamAddCallbackCatch.cpp @@ -11,7 +11,7 @@ #include "test_common.h" /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/streamOperations/hipstream_operations.cpp b/tests/src/runtimeApi/streamOperations/hipstream_operations.cpp index cd4d9fd425..8cd3b7bec2 100644 --- a/tests/src/runtimeApi/streamOperations/hipstream_operations.cpp +++ b/tests/src/runtimeApi/streamOperations/hipstream_operations.cpp @@ -39,7 +39,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvidia LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ diff --git a/tests/src/runtimeApi/synchronization/cache_coherency_cpu_gpu.cpp b/tests/src/runtimeApi/synchronization/cache_coherency_cpu_gpu.cpp index 3e4aec033c..dae95f1264 100644 --- a/tests/src/runtimeApi/synchronization/cache_coherency_cpu_gpu.cpp +++ b/tests/src/runtimeApi/synchronization/cache_coherency_cpu_gpu.cpp @@ -19,7 +19,7 @@ THE SOFTWARE. // Simple test for Fine Grained CPU-GPU coherency. /* HIT_START - * BUILD: %t %s ../../test_common.cpp HIPCC_OPTIONS -std=c++11 -lpthread EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvidia LINK_OPTIONS -lpthread * TEST: %t * HIT_END */ From 1dabde978f4ee7508ad91da133777a037261fbf4 Mon Sep 17 00:00:00 2001 From: ROCm CI Service Account <66695075+rocm-ci@users.noreply.github.com> Date: Wed, 15 Feb 2023 05:18:12 +0530 Subject: [PATCH 34/38] SWDEV-355543 - add descriptions in hip API parameters (#3163) Change-Id: I35b9ee55d0d743e0d009e5aad221c2b8a61fc732 --- include/hip/hip_runtime_api.h | 76 +++++++++++++++++------------------ 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index 51641c4958..62366a1b91 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -1396,7 +1396,7 @@ hipError_t hipInit(unsigned int flags); /** * @brief Returns the approximate HIP driver version. * - * @param [out] driverVersion + * @param [out] driverVersion driver version * * @returns #hipSuccess, #hipErrorInvalidValue * @@ -1412,7 +1412,7 @@ hipError_t hipDriverGetVersion(int* driverVersion); /** * @brief Returns the approximate HIP Runtime version. 
 *
- * @param [out] runtimeVersion
+ * @param [out] runtimeVersion HIP runtime version
 *
 * @returns #hipSuccess, #hipErrorInvalidValue
 *
@@ -1426,8 +1426,8 @@ hipError_t hipDriverGetVersion(int* driverVersion);
 hipError_t hipRuntimeGetVersion(int* runtimeVersion);
 /**
 * @brief Returns a handle to a compute device
- * @param [out] device
- * @param [in] ordinal
+ * @param [out] device Handle of device
+ * @param [in] ordinal Device ordinal
 *
 * @returns #hipSuccess, #hipErrorInvalidDevice
 */
@@ -1435,26 +1435,26 @@ hipError_t hipDeviceGet(hipDevice_t* device, int ordinal);
 /**
 * @brief Returns the compute capability of the device
- * @param [out] major
- * @param [out] minor
- * @param [in] device
+ * @param [out] major Major compute capability version number
+ * @param [out] minor Minor compute capability version number
+ * @param [in] device Device ordinal
 *
 * @returns #hipSuccess, #hipErrorInvalidDevice
 */
 hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device);
 /**
 * @brief Returns an identifer string for the device.
- * @param [out] name
- * @param [in] len
- * @param [in] device
+ * @param [out] name String of the device name
+ * @param [in] len Maximum length of string to store in device name
+ * @param [in] device Device ordinal
 *
 * @returns #hipSuccess, #hipErrorInvalidDevice
 */
 hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
 /**
 * @brief Returns an UUID for the device.[BETA]
- * @param [out] uuid
- * @param [in] device
+ * @param [out] uuid UUID for the device
+ * @param [in] device Device ordinal
 *
 * @beta This API is marked as beta, meaning, while this is feature complete,
 * it is still open to changes and may have outstanding issues.
@@ -1464,11 +1464,11 @@ hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
 */
 hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device);
 /**
- * @brief Returns a value for attr of link between two devices
- * @param [out] value
- * @param [in] attr
- * @param [in] srcDevice
- * @param [in] dstDevice
+ * @brief Returns a value for the attribute of the link between two devices
+ * @param [out] value Pointer to the value of the attribute
+ * @param [in] attr Enum of hipDeviceP2PAttr to query
+ * @param [in] srcDevice The source device of the link
+ * @param [in] dstDevice The destination device of the link
 *
 * @returns #hipSuccess, #hipErrorInvalidDevice
 */
@@ -1476,25 +1476,25 @@ hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
                                     int srcDevice, int dstDevice);
 /**
 * @brief Returns a PCI Bus Id string for the device, overloaded to take int device ID.
- * @param [out] pciBusId
- * @param [in] len
- * @param [in] device
+ * @param [out] pciBusId The string of PCI Bus Id format for the device
+ * @param [in] len Maximum length of string
+ * @param [in] device The device ordinal
 *
 * @returns #hipSuccess, #hipErrorInvalidDevice
 */
 hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device);
 /**
 * @brief Returns a handle to a compute device.
- * @param [out] device handle
- * @param [in] PCI Bus ID
+ * @param [out] device The handle of the device
+ * @param [in] pciBusId The string of PCI Bus Id for the device
 *
 * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
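+ *
+ * For illustration, a minimal sketch of the round trip between a device ordinal
+ * and its PCI bus id string (device 0 and the buffer size are arbitrary examples):
+ * @code
+ * char pciBusId[64];
+ * int device = -1;
+ * hipDeviceGetPCIBusId(pciBusId, (int)sizeof(pciBusId), 0);
+ * hipDeviceGetByPCIBusId(&device, pciBusId);
+ * @endcode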
 */
 hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId);
 /**
 * @brief Returns the total amount of memory on the device.
- * @param [out] bytes
- * @param [in] device
+ * @param [out] bytes The size of memory in bytes on the device
+ * @param [in] device The ordinal of the device
 *
 * @returns #hipSuccess, #hipErrorInvalidDevice
 */
@@ -1674,7 +1674,7 @@ hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int deviceId);
 /**
 * @brief Set L1/Shared cache partition.
 *
- * @param [in] cacheConfig
+ * @param [in] cacheConfig Cache configuration
 *
 * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorNotSupported
 *
@@ -1686,7 +1686,7 @@ hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig);
 /**
 * @brief Get Cache configuration for a specific Device
 *
- * @param [out] cacheConfig
+ * @param [out] cacheConfig Pointer to cache configuration
 *
 * @returns #hipSuccess, #hipErrorNotInitialized
 * Note: AMD devices do not support reconfigurable cache. This hint is ignored
@@ -1722,7 +1722,7 @@ hipError_t hipDeviceSetLimit ( enum hipLimit_t limit, size_t value );
 /**
 * @brief Returns bank width of shared memory for current device
 *
- * @param [out] pConfig
+ * @param [out] pConfig Pointer to the bank width of shared memory
 *
 * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
 *
@@ -1734,7 +1734,7 @@ hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig);
 /**
 * @brief Gets the flags set for current device
 *
- * @param [out] flags
+ * @param [out] flags Pointer to the flags
 *
 * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
 */
@@ -1742,7 +1742,7 @@ hipError_t hipGetDeviceFlags(unsigned int* flags);
 /**
 * @brief The bank width of shared memory on current device is set
 *
- * @param [in] config
+ * @param [in] config Configuration for the bank width of shared memory
 *
 * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
 *
@@ -1754,7 +1754,7 @@ hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config);
 /**
 * @brief The current device behavior is changed according the flags passed.
 *
- * @param [in] flags
+ * @param [in] flags Flag to set on the current device
 *
 * The schedule flags impact how HIP waits for the completion of a command running on a device.
 * hipDeviceScheduleSpin : HIP runtime will actively spin in the thread which submitted the
@@ -1779,8 +1779,8 @@ hipError_t hipSetDeviceFlags(unsigned flags);
 /**
 * @brief Device which matches hipDeviceProp_t is returned
 *
- * @param [out] device ID
- * @param [in] device properties pointer
+ * @param [out] device Pointer to the device
+ * @param [in] prop Pointer to the properties
 *
 * @returns #hipSuccess, #hipErrorInvalidValue
 */
@@ -1930,9 +1930,9 @@ hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle);
 /**
 * @brief Set attribute for a specific function
 *
- * @param [in] func;
- * @param [in] attr;
- * @param [in] value;
+ * @param [in] func Pointer to the function
+ * @param [in] attr Attribute to set
+ * @param [in] value Value to set
 *
 * @returns #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
 *
@@ -1944,7 +1944,7 @@ hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int valu
 /**
 * @brief Set Cache configuration for a specific function
 *
- * @param [in] config;
+ * @param [in] config Configuration to set
 *
 * @returns #hipSuccess, #hipErrorNotInitialized
 * Note: AMD devices and some Nvidia GPUS do not support reconfigurable cache. This hint is ignored
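+ *
+ * For illustration only (the kernel name below is hypothetical):
+ * @code
+ * __global__ void myKernel();
+ * hipFuncSetCacheConfig(reinterpret_cast<const void*>(myKernel), hipFuncCachePreferShared);
+ * @endcode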
@@ -1955,8 +1955,8 @@ hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t config);
 /**
 * @brief Set shared memory configuation for a specific function
 *
- * @param [in] func
- * @param [in] config
+ * @param [in] func Pointer to the function
+ * @param [in] config Configuration
 *
 * @returns #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
 *

From e63f21bc4b04bd1703a0f59b84645edbde4e33cb Mon Sep 17 00:00:00 2001
From: Siu Chi Chan
Date: Fri, 24 Feb 2023 18:36:03 -0500
Subject: [PATCH 35/38] SWDEV-385406 - [hipcc] Pass HIP_PATH to clang

The HIP_PATH env var has been broken by an earlier patch that changed the
linking to the HIP runtime to use --hip-link in hipcc. We need to detect
when the HIP_PATH env var is defined and pass that path to clang.

Change-Id: Iea939893844cce426d4bc4ace3539fc241363ff3
---
 bin/hipcc.pl | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/bin/hipcc.pl b/bin/hipcc.pl
index 6898cd0bba..2cd3752906 100644
--- a/bin/hipcc.pl
+++ b/bin/hipcc.pl
@@ -52,15 +52,21 @@

 # retrieve --rocm-path hipcc option from command line.
 # We need to respect this over the env var ROCM_PATH for this compilation.
-sub get_rocm_path_option {
+sub get_path_options {
   my $rocm_path="";
+  my $hip_path="";
   my @CLArgs = @ARGV;
   foreach $arg (@CLArgs) {
     if (index($arg,"--rocm-path=") != -1) {
       ($rocm_path) = $arg=~ /=\s*(.*)\s*$/;
+      next;
+    }
+    if (index($arg,"--hip-path=") != -1) {
+      ($hip_path) = $arg=~ /=\s*(.*)\s*$/;
+      next;
     }
   }
-  return $rocm_path;
+  return ($rocm_path, $hip_path);
 }

 $verbose = $ENV{'HIPCC_VERBOSE'} // 0;
@@ -99,13 +105,16 @@ sub delete_temp_dirs {
 }

 my $base_dir;
-my $rocmPath;
 BEGIN {
     $base_dir = dirname(Cwd::realpath(__FILE__) );
-    $rocmPath = get_rocm_path_option();
-    if ($rocmPath ne '') {
+    my ($rocm_path, $hip_path) = get_path_options();
+    if ($rocm_path ne '') {
+        # --rocm-path takes precedence over ENV{ROCM_PATH}
+        $ENV{ROCM_PATH}=$rocm_path;
+    }
+    if ($hip_path ne '') {
         # --rocm-path takes precedence over ENV{ROCM_PATH}
-        $ENV{ROCM_PATH}=$rocmPath;
+        $ENV{HIP_PATH}=$hip_path;
     }
 }
 use lib "$base_dir/";
@@ -566,6 +575,13 @@ BEGIN
     }
 }

+# If the HIP_PATH env var is defined, pass that path to Clang
+if ($ENV{'HIP_PATH'}) {
+    my $hip_path_flag = " --hip-path=\"$HIP_PATH\"";
+    $HIPCXXFLAGS .= $hip_path_flag;
+    $HIPLDFLAGS .= $hip_path_flag;
+}
+
 if ($hasHIP) {
     if ($DEVICE_LIB_PATH ne "$ROCM_PATH/amdgcn/bitcode") {
         $HIPCXXFLAGS .= " --hip-device-lib-path=\"$DEVICE_LIB_PATH\"";

From b3ffe6ab1214cdee42864380b07e1f04e621e6c1 Mon Sep 17 00:00:00 2001
From: Julia Jiang
Date: Wed, 8 Feb 2023 12:31:47 -0500
Subject: [PATCH 36/38] SWDEV-354557 - correct typo in HIP direct dispatch document

Change-Id: Iff905b916c13c3fffd38c8c4e3ddc910df21caa3
---
 docs/markdown/hip_programming_guide.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/markdown/hip_programming_guide.md b/docs/markdown/hip_programming_guide.md
index 507a72c502..80b50b96e5 100644
--- a/docs/markdown/hip_programming_guide.md
+++ b/docs/markdown/hip_programming_guide.md
@@ -108,9 +108,9 @@ A stronger system-level fence can be specified when the event is created with hi
 - HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide performance benefit, but care must be taken to use the correct synchronization.

 ## Direct Dispatch
-HIP runtime has Direct Dispatch enabled by default in ROCM 4.4. With this feature we move away from our conventional producer-consumer model where the runtime creates a worker thread(consumer) for each HIP Stream, where as the host thread(producer) enqueues commands to a command queue(per stream).
+HIP runtime has Direct Dispatch enabled by default in ROCM 4.4. With this feature we move away from our conventional producer-consumer model, where the runtime creates a worker thread (consumer) for each HIP Stream, and the host thread (producer) enqueues commands to a command queue (per stream).

-For Direct Dispatch, the runtime would directly queue a packet to the AQL queue (user mode queue to GPU) in case of Dispatch and some of the synchronization. This has shown to the total latency of the HIP Dispatch API and latency to launch first wave on the GPU.
+For Direct Dispatch, HIP runtime would directly enqueue a packet to the AQL queue (user mode queue on GPU) on the Dispatch API call from the application. That has been shown to reduce the latency to launch the first wave on the idle GPU and the total time of tiny dispatches synchronized with the host.

 In addition, eliminating the threads in runtime has reduced the variance in the dispatch numbers as the thread scheduling delays and atomics/locks synchronization latencies are reduced.

From f3d8fc881cacdea8c6dd33ec7733224eb2513a58 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta
Date: Thu, 27 Apr 2023 13:58:40 +0000
Subject: [PATCH 37/38] SWDEV-397344 - Bump patch version to 30202

Change-Id: Ida2d603006218a8c74fd58b81830ac7802ab58d1
---
 VERSION | 2 +-
 bin/hipvars.pm | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/VERSION b/VERSION
index 9848f824d1..ff162666ec 100644
--- a/VERSION
+++ b/VERSION
@@ -3,4 +3,4 @@
 #HIP_VERSION_MINOR
 5
 #HIP_VERSION_PATCH
-30201
+30202

diff --git a/bin/hipvars.pm b/bin/hipvars.pm
index c0c2eaa3f8..94b3bfad96 100644
--- a/bin/hipvars.pm
+++ b/bin/hipvars.pm
@@ -27,7 +27,7 @@ use File::Basename;

 $HIP_BASE_VERSION_MAJOR = "5";
 $HIP_BASE_VERSION_MINOR = "5";
-$HIP_BASE_VERSION_PATCH = "30201";
+$HIP_BASE_VERSION_PATCH = "30202";

 #---
 # Function to parse config file

From 9837883d6998a87f9d978d216342f54a0ab0d566 Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras
Date: Tue, 25 Apr 2023 18:32:32 +0100
Subject: [PATCH 38/38] SWDEV-398296/SWDEV-393199 - Added new include file for
 opengl interop mappings for nvidia

Change-Id: Ic823cec1eb972ece4029595ebf2d52f569af444f
---
 include/hip/hip_gl_interop.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 include/hip/hip_gl_interop.h

diff --git a/include/hip/hip_gl_interop.h b/include/hip/hip_gl_interop.h
new file mode 100644
index 0000000000..207beea3df
--- /dev/null
+++ b/include/hip/hip_gl_interop.h
@@ -0,0 +1,30 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef HIP_GL_INTEROP_H
+#define HIP_GL_INTEROP_H
+
+#include <hip/hip_runtime_api.h>
+
+#if !(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && (defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__))
+#include "hip/nvidia_detail/nvidia_hip_gl_interop.h"
+#endif
+#endif
\ No newline at end of file