diff --git a/include/acl_hal.h b/include/acl_hal.h index e0fb58e3..e11ecb7d 100644 --- a/include/acl_hal.h +++ b/include/acl_hal.h @@ -57,12 +57,12 @@ struct acl_pkg_file; /// @name Callback type declarations ///@{ typedef void (*acl_event_update_callback)(cl_event event, int new_status); -typedef void (*acl_kernel_update_callback)(int activation_id, cl_int status); -typedef void (*acl_profile_callback)(int activation_id); +typedef void (*acl_kernel_update_callback)(unsigned int physical_device_id, int activation_id, cl_int status); +typedef void (*acl_profile_callback)(unsigned int physical_device_id, int activation_id); typedef void (*acl_device_update_callback)( unsigned physical_device_id, CL_EXCEPTION_TYPE_INTEL exception_type, void *user_private_info, size_t user_cb); -typedef void (*acl_process_printf_buffer_callback)(int activation_id, int size, +typedef void (*acl_process_printf_buffer_callback)(unsigned int physical_device_id, int activation_id, int size, int debug_dump_printf); ///@} diff --git a/include/acl_kernel.h b/include/acl_kernel.h index 1a6f88ef..947d9146 100644 --- a/include/acl_kernel.h +++ b/include/acl_kernel.h @@ -41,12 +41,12 @@ void acl_launch_kernel(void *user_data, acl_device_op_t *op); // Called when we get a kernel interrupt indicating that profiling data is ready ACL_EXPORT -void acl_profile_update(int activation_id); +void acl_profile_update(unsigned int physical_device_id, int activation_id); // This should be called by the HAL, to receive notification of RUNNING and // COMPLETE state transitions, and used printf buffer size ACL_EXPORT -void acl_receive_kernel_update(int activation_id, cl_int status); +void acl_receive_kernel_update(unsigned int physical_device_id, int activation_id, cl_int status); // Used to check if one of the kernel arguments needs to be mapped to the device // When unmapping subbuffers we may transfer memory that is currently used diff --git a/include/acl_platform.h b/include/acl_platform.h index 
b852e51b..43f8c18c 100644 --- a/include/acl_platform.h +++ b/include/acl_platform.h @@ -22,6 +22,11 @@ void acl_init_platform(void); void acl_finalize_init_platform(unsigned int num_devices, const cl_device_id *devices); const char *acl_platform_extensions(void); +acl_device_op_queue_t *get_device_op_queue(unsigned int physical_device_id); +acl_device_op_queue_t *get_device_op_queue_from_context(cl_context context); + +acl_locking_data_t *get_device_op_queue_locking_data(cl_device_id device); +acl_locking_data_t *get_device_op_queue_locking_data_from_context(cl_context context); #if defined(__cplusplus) } /* extern "C" */ diff --git a/include/acl_printf.h b/include/acl_printf.h index f8664f37..561ab241 100644 --- a/include/acl_printf.h +++ b/include/acl_printf.h @@ -18,7 +18,7 @@ extern "C" { // Enqueue printf buffer dump ACL_EXPORT -void acl_schedule_printf_buffer_pickup(int activation_id, int size, +void acl_schedule_printf_buffer_pickup(unsigned int physical_device_id, int activation_id, int size, int overflow); // Print the printf data associated with the given deviced operation diff --git a/include/acl_thread.h b/include/acl_thread.h index 753adf47..6b36c11e 100644 --- a/include/acl_thread.h +++ b/include/acl_thread.h @@ -4,14 +4,18 @@ #ifndef ACL_THREAD_H #define ACL_THREAD_H -#include "acl.h" -#include "acl_context.h" -#include "acl_types.h" - +// System headers. #include #include #include +// External library headers. +#include + +// Internal headers. +#include "acl.h" + + #if defined(__cplusplus) extern "C" { #endif @@ -23,6 +27,22 @@ extern "C" { #define ACL_TLS __declspec(thread) #endif + +/* An opaque type for critical section + condition variable. + * Use indirection here so we don't force every module in the world to pull + * in windows.h. 
+ */ +// typedef struct acl_condvar_s *acl_condvar_t; + +typedef struct acl_locking_data_s acl_locking_data_t; +struct acl_locking_data_s { + struct acl_condvar_s condvar; + int lock_count; + int inside_sig_flag; + int inside_sig_old_lock_count; +}; + + extern ACL_TLS int acl_global_lock_count; extern ACL_TLS int acl_inside_sig_flag; extern ACL_TLS int acl_inside_sig_old_lock_count; @@ -38,23 +58,46 @@ extern ACL_TLS int acl_inside_sig_old_lock_count; // If a function needs an assert that passes if either the lock is held or // inside a signal handler, it can use "acl_assert_locked_or_sig()". -static inline int acl_is_inside_sig() { return acl_inside_sig_flag; } +static inline int acl_is_inside_sig(acl_locking_data_t *locking_data = nullptr) { + if (locking_data == nullptr) { + return acl_inside_sig_flag; + } else { + return locking_data->inside_sig_flag; + } +} -static inline void acl_assert_inside_sig() { assert(acl_is_inside_sig()); } +static inline void acl_assert_inside_sig(acl_locking_data_t *locking_data = nullptr) { + assert(acl_is_inside_sig(locking_data)); +} -static inline void acl_assert_outside_sig() { assert(!acl_is_inside_sig()); } +static inline void acl_assert_outside_sig(acl_locking_data_t *locking_data = nullptr) { + assert(!acl_is_inside_sig(locking_data)); +} -static inline void acl_sig_started() { - assert(!acl_inside_sig_flag); - acl_inside_sig_flag = 1; - acl_inside_sig_old_lock_count = acl_global_lock_count; - acl_global_lock_count = 0; +static inline void acl_sig_started(acl_locking_data_t *locking_data = nullptr) { + if (locking_data == nullptr) { + assert(!acl_inside_sig_flag); + acl_inside_sig_flag = 1; + acl_inside_sig_old_lock_count = acl_global_lock_count; + acl_global_lock_count = 0; + } else { + assert(!locking_data->inside_sig_flag); + locking_data->inside_sig_flag = 1; + locking_data->inside_sig_old_lock_count = locking_data->lock_count; + locking_data->lock_count = 0; + } } -static inline void acl_sig_finished() { - 
assert(acl_inside_sig_flag); - acl_inside_sig_flag = 0; - acl_global_lock_count = acl_inside_sig_old_lock_count; +static inline void acl_sig_finished(acl_locking_data_t *locking_data = nullptr) { + if (locking_data == nullptr) { + assert(acl_inside_sig_flag); + acl_inside_sig_flag = 0; + acl_global_lock_count = acl_inside_sig_old_lock_count; + } else { + assert(locking_data->inside_sig_flag); + locking_data->inside_sig_flag = 0; + locking_data->lock_count = locking_data->inside_sig_old_lock_count; + } } // Blocking/Unblocking signals (Only implemented for Linux) @@ -75,31 +118,41 @@ static inline void acl_sig_unblock_signals() { // -- global lock functions -- -void acl_lock(); -void acl_unlock(); -int acl_suspend_lock(); -void acl_resume_lock(int lock_count); +void acl_lock(acl_locking_data_t *locking_data = nullptr); +void acl_unlock(acl_locking_data_t *locking_data = nullptr); +int acl_suspend_lock(acl_locking_data_t *locking_data = nullptr); +void acl_resume_lock(int lock_count, acl_locking_data_t *locking_data = nullptr); void acl_wait_for_device_update(cl_context context); -void acl_signal_device_update(); - -static inline int acl_is_locked() { return (acl_global_lock_count > 0); } +void acl_signal_device_update(acl_locking_data_t *locking_data = nullptr); + +static inline int acl_is_locked(acl_locking_data_t *locking_data = nullptr) { + if (locking_data == nullptr) { + return acl_global_lock_count > 0; + } else { + return (locking_data->lock_count > 0); + } +} // Used by dynamically loaded libs to check lock status. 
-int acl_is_locked_callback(void); +int acl_is_locked_callback(acl_locking_data_t *locking_data = nullptr); -static inline void acl_assert_locked() { assert(acl_is_locked()); } +static inline void acl_assert_locked(acl_locking_data_t *locking_data = nullptr) { + assert(acl_is_locked(locking_data)); +} -static inline void acl_assert_locked_or_sig() { - assert(acl_is_locked() || acl_is_inside_sig()); +static inline void acl_assert_locked_or_sig(acl_locking_data_t *locking_data = nullptr) { + assert(acl_is_locked(locking_data) || acl_is_inside_sig(locking_data)); } -static inline void acl_assert_unlocked() { assert(!acl_is_locked()); } +static inline void acl_assert_unlocked(acl_locking_data_t *locking_data = nullptr) { + assert(!acl_is_locked(locking_data)); +} // -- misc functions -- int acl_get_thread_id(); int acl_get_pid(); -void acl_yield_lock_and_thread(); +void acl_yield_lock_and_thread(acl_locking_data_t *locking_data = nullptr); #if defined(__cplusplus) } /* extern "C" */ diff --git a/include/acl_types.h b/include/acl_types.h index 0536838e..7d652b22 100644 --- a/include/acl_types.h +++ b/include/acl_types.h @@ -24,6 +24,7 @@ #include "acl_device_binary.h" #include "acl_hal.h" #include "acl_icd_dispatch.h" +#include "acl_thread.h" #if defined(__cplusplus) extern "C" { @@ -235,12 +236,6 @@ typedef enum { */ #define CL_CONTEXT_COMPILE_COMMAND_INTELFPGA ACL_EXPERIMENTAL_ENUM(1) -/* An opaque type for critical section + condition variable. - * Use indirection here so we don't force every module in the world to pull - * in windows.h. - */ -typedef struct acl_condvar_s *acl_condvar_t; - typedef enum { ACL_INVALID_EXECUTION_TRANSITION = -1, ACL_INVALID_EXECUTION_STATUS = -2, @@ -981,6 +976,7 @@ typedef struct _cl_context { cl_uint refcount; acl_compiler_mode_t compiler_mode; + // Is this context in the middle of being freed? // Fix re-entrancy of clReleaseContext. 
int is_being_freed; @@ -1524,6 +1520,11 @@ typedef struct acl_device_op_queue_t { acl_device_op_stats_t stats; +  // per-context condition variable for finer-grained locking. +  // Only operations on devices in the current context are protected +  // by this condvar. +  acl_locking_data_t locking_data; +  // The operations themselves. acl_device_op_t op[ACL_MAX_DEVICE_OPS]; @@ -1531,6 +1532,9 @@ typedef struct acl_device_op_queue_t { // Used for checking if the device has concurrent read/write support acl_device_def_t *devices[ACL_MAX_DEVICE]; +  // Number of physical devices managed by this queue +  int num_managed_devices; +  // These function pointers must be set to the actions to be taken when // kicking off various device activities. void (*launch_kernel)(void *, acl_device_op_t *); @@ -1633,7 +1637,22 @@ typedef struct _cl_platform_id // The device operation queue. // These are the operations that can run immediately on the device. -  acl_device_op_queue_t device_op_queue; + +  // Map from physical device id to device op queue that this device belongs +  // to. All devices in a single context belong to the same device op queue. +  // If multiple contexts share even a single device, all devices in all these +  // contexts share a single device op queue. Only if multiple contexts do not +  // share even a single device will these devices belong to separate device +  // op queues. +  int physical_dev_id_to_doq_idx[ACL_MAX_DEVICE]; // [0..num_devices-1] + +  // Array of device_op_queues. A new queue will be allocated as required +  // during platform init time. +  int num_device_op_queues; +  acl_device_op_queue_t *device_op_queues[ACL_MAX_DEVICE]; // [0..num_devices-1] + +  // TODO: REMOVE ME +  // acl_device_op_queue_t device_op_queue; // Limits. See clGetDeviceInfo for semantics. 
unsigned int max_param_size; diff --git a/src/acl_command_queue.cpp b/src/acl_command_queue.cpp index e51b73f6..2b8d886c 100644 --- a/src/acl_command_queue.cpp +++ b/src/acl_command_queue.cpp @@ -625,7 +625,7 @@ int acl_update_queue(cl_command_queue command_queue) { } // First nudge the device operation scheduler. - acl_update_device_op_queue(&(acl_platform.device_op_queue)); + acl_update_device_op_queue(get_device_op_queue_from_context(command_queue->context)); if (command_queue->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { return acl_update_ooo_queue(command_queue); diff --git a/src/acl_context.cpp b/src/acl_context.cpp index c22c2f09..5784c902 100644 --- a/src/acl_context.cpp +++ b/src/acl_context.cpp @@ -316,6 +316,19 @@ CL_API_ENTRY cl_int CL_API_CALL clReleaseContextIntelFPGA(cl_context context) { context->device[i]->last_bin->unload_content(); } +#if 0 + // TODO: Remove devices from device op queue that manages them. + // If a device op queue does not manage any devices, de-allocate it. + for (unsigned i = 0; i < context->num_devices; i++) { + unsigned int physical_device_id = context->device[i]->def.physical_device_id; + int cur_doq_idx = acl_platform.physical_dev_id_to_doq_idx[physical_device_id]; + acl_platform.physical_dev_id_to_doq_idx[physical_device_id] = -1; + + acl_device_op_queue_t *cur_doq = acl_platform.device_op_queues[cur_doq_idx]; + cur_doq->num_managed_devices--; + } +#endif + // We have to close all devices associated with this context so they can be // opened by other processes acl_get_hal()->close_devices(context->num_devices, context->device); @@ -343,6 +356,22 @@ CL_API_ENTRY cl_int CL_API_CALL clReleaseContextIntelFPGA(cl_context context) { acl_free(context->command_queue); } + +#if 0 + // disconnect devices managed by this context from the device op queue that + // manages them. 
+ for (int i = 0; i < acl_platform.num_device_op_queues; i++) { + if (acl_platform.device_op_queues[i] != nullptr && + acl_platform.device_op_queues[i]->num_managed_devices == 0) { + // Should all the ops on this queue be done by now? I hope so, we're about to + // delete the context! + acl_print_debug_msg("Deleting device op queue %d as no devices are managed by it\n", i); + //acl_free (acl_platform.device_op_queues[i]); + //acl_platform.device_op_queues[i] = nullptr; + } + } +#endif + clReleaseMemObject(context->unwrapped_host_mem); l_forcibly_release_allocations(context); @@ -1043,6 +1072,8 @@ static void l_forcibly_release_allocations(cl_context context) { acl_release(context->device[idevice]); } + // acl_ + // Buffers might have been allocated. acl_forcibly_release_all_memory_for_context(context); acl_forcibly_release_all_svm_memory_for_context(context); diff --git a/src/acl_device_op.cpp b/src/acl_device_op.cpp index fe10af2d..20421d0c 100644 --- a/src/acl_device_op.cpp +++ b/src/acl_device_op.cpp @@ -240,6 +240,12 @@ void acl_init_device_op_queue_limited(acl_device_op_queue_t *doq, acl_device_op_reset_device_op(doq->op + i); } + // Init locking data for this context + acl_init_condvar(&(doq->locking_data.condvar)); + doq->locking_data.lock_count = 0; + doq->locking_data.inside_sig_flag = 0; + doq->locking_data.inside_sig_old_lock_count = 0; + // The live lists are all empty. 
doq->first_live = ACL_OPEN; doq->last_committed = ACL_OPEN; @@ -264,6 +270,7 @@ void acl_init_device_op_queue_limited(acl_device_op_queue_t *doq, doq->usm_memcpy = acl_usm_memcpy; doq->log_update = 0; + doq->num_managed_devices = 0; for (i = 0; i < ACL_MAX_DEVICE; i++) { doq->devices[i] = NULL; } @@ -1444,23 +1451,27 @@ static void l_record_milestone(acl_device_op_t *op, ACL_EXPORT void acl_device_op_dump_stats(void) { #ifdef ACL_DEVICE_OP_STATS - acl_device_op_stats_t *stats = &(acl_platform.device_op_queue.stats); - acl_assert_locked(); - printf("Device op stats:\n"); - -#define PF(X) \ - printf(" %-25s %12u %12.6f\n", #X, stats->num_##X, \ - ((float)stats->num_##X / (float)stats->num_queue_updates)); - PF(queue_updates) - PF(exclusion_checks) - PF(conflict_checks) - PF(submits) - PF(live_op_pending_calcs) - PF(queued) - PF(running) - PF(complete) - fflush(stdout); -#undef PF + #define PF(X) \ + printf(" %-25s %12u %12.6f\n", #X, stats->num_##X, \ + ((float)stats->num_##X / (float)stats->num_queue_updates)); + + for (int iq = 0; iq < acl_platform.num_device_op_queues; iq++) { + if (acl_platform.device_op_queues[iq] == nullptr) continue; + acl_device_op_stats_t *stats = &(acl_platform.device_op_queues[iq]->stats); + acl_assert_locked(); + printf("Device op stats:\n"); + + PF(queue_updates) + PF(exclusion_checks) + PF(conflict_checks) + PF(submits) + PF(live_op_pending_calcs) + PF(queued) + PF(running) + PF(complete) + fflush(stdout); + } + #undef PF #else printf("Device op stats are not available\n"); #endif diff --git a/src/acl_hal_mmd.cpp b/src/acl_hal_mmd.cpp index 27fef0f6..b4612bbb 100644 --- a/src/acl_hal_mmd.cpp +++ b/src/acl_hal_mmd.cpp @@ -2632,7 +2632,8 @@ void acl_hal_mmd_reset_kernels(cl_device_id device) { int activation_id = kern[physical_device_id].accel_job_ids[k][i]; if (kern[physical_device_id].accel_job_ids[k][i] >= 0) { kern[physical_device_id].accel_job_ids[k][i] = -1; - acl_kernel_update_fn(activation_id, + 
acl_kernel_update_fn(physical_device_id, + activation_id, -1); // Signal that it finished with error, since // we forced it to finish } diff --git a/src/acl_kernel.cpp b/src/acl_kernel.cpp index 70c562e8..7ae60162 100644 --- a/src/acl_kernel.cpp +++ b/src/acl_kernel.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1548,6 +1549,14 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueNDRangeKernelIntelFPGA( const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { cl_int ret; + +#if 0 + acl_locking_data_t *locking_data = nullptr; + if (command_queue != nullptr) { + locking_data = get_device_op_queue_locking_data_from_context(command_queue->context); + } + acl_lock(locking_data); + #endif acl_lock(); ret = l_enqueue_kernel_with_type( @@ -3006,7 +3015,7 @@ int acl_submit_kernel_device_op(cl_event event) { return result; } - acl_device_op_queue_t *doq = &(acl_platform.device_op_queue); + acl_device_op_queue_t *doq = get_device_op_queue_from_context(event->context); acl_device_op_t *last_op = 0; int ok = 1; @@ -3137,8 +3146,8 @@ void acl_launch_kernel(void *user_data, acl_device_op_t *op) { } // Called when we get a kernel interrupt indicating that profiling data is ready -void acl_profile_update(int activation_id) { - acl_device_op_queue_t *doq = &(acl_platform.device_op_queue); +void acl_profile_update(unsigned int physical_device_id, int activation_id) { + acl_device_op_queue_t *doq = get_device_op_queue(physical_device_id); if (activation_id >= 0 && activation_id < doq->max_ops) { // This address is stable, given a fixed activation_id. @@ -3151,8 +3160,8 @@ void acl_profile_update(int activation_id) { // Handle a status update from within a HAL interrupt. // We can't do much: only update a flag in the right spot. 
-void acl_receive_kernel_update(int activation_id, cl_int status) { - acl_device_op_queue_t *doq = &(acl_platform.device_op_queue); +void acl_receive_kernel_update(unsigned int physical_device_id, int activation_id, cl_int status) { + acl_device_op_queue_t *doq = get_device_op_queue(physical_device_id); // This function can potentially be called by a HAL that does not use the // ACL global lock, so we need to use acl_lock() instead of diff --git a/src/acl_kernel_if.cpp b/src/acl_kernel_if.cpp index 9501716a..116fc845 100644 --- a/src/acl_kernel_if.cpp +++ b/src/acl_kernel_if.cpp @@ -1250,7 +1250,7 @@ void acl_kernel_if_launch_kernel_on_custom_sof( // next to launch, its status will be set to CL_RUNNING and below call // to update status will do nothing if (kern->accel_queue_front[accel_id] == kern->accel_queue_back[accel_id]) { - acl_kernel_if_update_fn((int)(activation_id), CL_RUNNING); + acl_kernel_if_update_fn(kern->physical_device_id, (int)(activation_id), CL_RUNNING); } kern->accel_queue_front[accel_id] = next_launch_index; @@ -1394,7 +1394,7 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { activation_id, printf_size); // update status, which will dump the printf buffer, set // debug_dump_printf = 0 - acl_process_printf_buffer_fn(activation_id, (int)printf_size, 0); + acl_process_printf_buffer_fn(kern->physical_device_id, activation_id, (int)printf_size, 0); ACL_KERNEL_IF_DEBUG_MSG( kern, ":: Accelerator %d new csr is %x.\n", k, @@ -1432,7 +1432,7 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { // This is an autorun kernel acl_process_autorun_profiler_scan_chain(kern->physical_device_id, k); } else { - acl_kernel_profile_fn(activation_id); + acl_kernel_profile_fn(kern->physical_device_id, activation_id); } continue; } @@ -1482,7 +1482,7 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { ":: Calling acl_process_printf_buffer_fn with " "activation_id=%d and printf_size=%u.\n", activation_id, printf_size); - 
acl_process_printf_buffer_fn(activation_id, (int)printf_size, 0); + acl_process_printf_buffer_fn(kern->physical_device_id, activation_id, (int)printf_size, 0); } // Executing the following update after reading from performance @@ -1491,7 +1491,7 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { // completion timestamp - reading performance results through slave // ports before setting CL_COMPLETE adds to the apparent kernel time. // - acl_kernel_if_update_fn(activation_id, CL_COMPLETE); + acl_kernel_if_update_fn(kern->physical_device_id, activation_id, CL_COMPLETE); kern->accel_queue_back[k] = next_queue_back; if (kern->accel_queue_back[k] == @@ -1501,7 +1501,8 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) { next_queue_back = kern->accel_queue_back[k] + 1; if (kern->accel_job_ids[k][next_queue_back] > -1) { - acl_kernel_if_update_fn(kern->accel_job_ids[k][next_queue_back], + acl_kernel_if_update_fn(kern->physical_device_id, + kern->accel_job_ids[k][next_queue_back], CL_RUNNING); } } @@ -1542,7 +1543,7 @@ void acl_kernel_if_debug_dump_printf(acl_kernel_if *kern, unsigned k) { activation_id, printf_size); // set debug_dump_printf to 1 - acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1); + acl_process_printf_buffer_fn(kern->physical_device_id, activation_id, (int)printf_size, 1); } } diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index af10328d..d92f1b17 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -5312,7 +5312,7 @@ int acl_submit_mem_transfer_device_op(cl_event event) { return result; } - acl_device_op_queue_t *doq = &(acl_platform.device_op_queue); + acl_device_op_queue_t *doq = get_device_op_queue_from_context(event->context); acl_device_op_t *last_op = 0; int src_on_host; int dst_on_host; @@ -7045,7 +7045,7 @@ int acl_submit_migrate_mem_device_op(cl_event event) { unsigned int ibuf; int ok = 1; acl_mem_migrate_t memory_migration = event->cmd.info.memory_migration; - acl_device_op_queue_t *doq = 
&(acl_platform.device_op_queue); + acl_device_op_queue_t *doq = get_device_op_queue_from_context(event->context); acl_device_op_t *last_op = 0; // Precautionary, but it also nudges the device scheduler to try diff --git a/src/acl_offline_hal.cpp b/src/acl_offline_hal.cpp index 1e2aaefc..ae033ca6 100644 --- a/src/acl_offline_hal.cpp +++ b/src/acl_offline_hal.cpp @@ -269,10 +269,9 @@ static void acl_offline_hal_launch_kernel( cl_int activation_id = invocation_wrapper->image->activation_id; acl_assert_locked(); - physical_id = physical_id; // For emulating an offline device, just say we start and finish right away. - acl_offline_hal_kernel_callback(activation_id, CL_RUNNING); - acl_offline_hal_kernel_callback(activation_id, CL_COMPLETE); + acl_offline_hal_kernel_callback(physical_id, activation_id, CL_RUNNING); + acl_offline_hal_kernel_callback(physical_id, activation_id, CL_COMPLETE); } static void acl_offline_hal_unstall_kernel(unsigned int physical_id, diff --git a/src/acl_platform.cpp b/src/acl_platform.cpp index 0a6ba6e5..8544a566 100644 --- a/src/acl_platform.cpp +++ b/src/acl_platform.cpp @@ -6,6 +6,7 @@ #include #include #include +#include // for sort, binary_search #ifdef _WIN32 #include @@ -406,12 +407,19 @@ void acl_init_platform(void) { for (unsigned int i = 0; i < acl_platform.num_devices; i++) { // initialize static information for these devices l_add_device(static_cast(i)); + } l_initialize_offline_devices(offline_mode); - // Device operation queue. - acl_init_device_op_queue(&acl_platform.device_op_queue); + // Device operation queue(s) and related map will be + // allocated during the finalize stage + acl_platform.num_device_op_queues = 0; + for (unsigned int i = 0; i < ACL_MAX_DEVICE; i++) { + acl_platform.physical_dev_id_to_doq_idx[i] = -1; + acl_platform.device_op_queues[i] = nullptr; + } + //acl_init_device_op_queue(&acl_platform.device_op_queue); // Initialize sampler allocator. 
for (int i = 0; i < ACL_MAX_SAMPLER; i++) { @@ -653,6 +661,111 @@ static void l_initialize_offline_devices(int offline_mode) { l_show_devs("offline"); } + +#define my_debug_msg acl_print_debug_msg + +// Assign devices to device operation queues. +// All devices in one context are assigned to one dev op queue. +// If two or more contexts share at least one device, all devices +// in all such contexts are assigned to a single dev op queue. +// Lowest numbered dev op queue is preferred, for ease of indexing +// acl_platform.device_op_queues[] +// This function is called ONCE PER CONTEXT, therefore it +// assumes that set of devices in each call belong to different contexts. +void l_assign_devices_to_dev_op_queues (unsigned int num_devices, const cl_device_id *devices) { + +  unsigned int i, j; +  acl_assert_locked(); +  std::vector doq_idx_found; + +  my_debug_msg("l_assign_devices_to_dev_op_queues with %d devices\n", num_devices); + +  // find lowest device op queue index for devices in this context +  unsigned int num_platform_devices = acl_platform.num_devices; +  for (i = 0; i < num_platform_devices; i++) { +    for (j = 0; j < num_devices; j++) { +      if (&(acl_platform.device[i]) == devices[j]) { +        int doq_idx = acl_platform.physical_dev_id_to_doq_idx[i]; +        if ((doq_idx > -1)) { +          my_debug_msg("  device id %d already belongs to queue idx %d\n", i, doq_idx); +          doq_idx_found.push_back(doq_idx); +        } +        break; +      } +    } +  } + + +  int cur_doq_idx = -1; +  if (!doq_idx_found.empty()) { +    // pick lowest existing device op queue index +    std::sort(doq_idx_found.begin(), doq_idx_found.end()); +    cur_doq_idx = doq_idx_found[0]; +  } else { +    // no existing device op queues. Create a new one. +    my_debug_msg("  did not find any device op queues for these devices. 
Creatig a new one #%d\n", acl_platform.num_device_op_queues); + cur_doq_idx = acl_platform.num_device_op_queues; + acl_platform.num_device_op_queues++; + acl_platform.device_op_queues[cur_doq_idx] = + (acl_device_op_queue_t *)acl_malloc(sizeof(acl_device_op_queue_t)); + + acl_init_device_op_queue(acl_platform.device_op_queues[cur_doq_idx]); + } + + my_debug_msg(" all affected devices will be assigned to doq #%d\n", cur_doq_idx); + + // all devices in passed in devices[] array are now on cur_doq_idx queue + // AND all devices that are on queues in doq_idx_found list as well! + + // iterator i is the physical device id + for (i = 0; i < num_platform_devices; i++) { + + int doq_idx = acl_platform.physical_dev_id_to_doq_idx[i]; + bool doq_idx_in_found_list = std::binary_search(doq_idx_found.begin(), doq_idx_found.end(), doq_idx); + if (doq_idx_in_found_list && (doq_idx != cur_doq_idx)) { + + // this device is already used by another context. + // It is also using the queue to be merged. + acl_platform.device_op_queues[doq_idx]->num_managed_devices--; + acl_platform.device_op_queues[cur_doq_idx]->num_managed_devices++; + + acl_platform.physical_dev_id_to_doq_idx[i] = cur_doq_idx; + my_debug_msg(" device %d reassigned from doq #%d doq #%d\n", i, doq_idx, cur_doq_idx); + // TODO: ARE ALL OPERATIONS ON THE doq_idx QUEUE ALREADY DONE????? + // CONCERN IF THE USER CREATES A NEW CONTEXT WITH OVER-LAPPING + // DEVICES BEFORE LETTING CURRENT OPS FINISH. + } + if (doq_idx == -1) { + // a device without an assigned device op queue may be in the passed + // devices[] list. If that's the case, assign it to cur_doq_idx queue. + // Will happen if this is the first context that is using this device. 
+ for (j = 0; j < num_devices; j++) { + if (&(acl_platform.device[i]) == devices[j]) { + acl_platform.physical_dev_id_to_doq_idx[i] = cur_doq_idx; + acl_platform.device_op_queues[cur_doq_idx]->num_managed_devices++; + my_debug_msg(" device %d is assigned to doq #%d\n", i, cur_doq_idx); + break; + } + } + } + } + + // NOTE: De-allocate unneeded device op queues when platform is + // shut down. It is possible that the user will create another context + // after already submitting some work to existing queues. + // Find all device_op_queues with num_managed_devices = 0, wait for them + // to finish, then recycle them somehow. +} + + +// Free device op queue associated with these devices if this device +// op queue is not used by any other devices. +void acl_free_device_op_queue (unsigned int num_devices, + const cl_device_id *devices) { + +} + + // Initialize acl_platform with device information. // Also determine global mem address range. static void l_initialize_devices(const acl_system_def_t *present_board_def, @@ -668,6 +781,8 @@ static void l_initialize_devices(const acl_system_def_t *present_board_def, present_board_def->num_devices); } + l_assign_devices_to_dev_op_queues (num_devices, devices); + // shipped_board_def populated earlier in l_initialize_offline_devices if (offline_mode == ACL_CONTEXT_OFFLINE_AND_AUTODISCOVERY || @@ -956,6 +1071,48 @@ void acl_receive_device_exception(unsigned physical_device_id, } } +acl_device_op_queue_t *get_device_op_queue(unsigned int physical_device_id) { + + int idx = 0; + if (idx >= ACL_MAX_DEVICE) { + return nullptr; + } + idx = acl_platform.physical_dev_id_to_doq_idx[physical_device_id]; + if (idx == -1 || idx >= acl_platform.num_device_op_queues) { + return nullptr; + } else { + return acl_platform.device_op_queues[idx]; + } +} + +acl_device_op_queue_t *get_device_op_queue_from_context(cl_context context) { + if (context != nullptr && context->num_devices > 0) { + unsigned int physical_device_id = 
context->device[0]->def.physical_device_id; + return get_device_op_queue(physical_device_id); + } else { + return nullptr; + } +} + +acl_locking_data_t *get_device_op_queue_locking_data(cl_device_id device) { + unsigned int physical_device_id = device->def.physical_device_id; + acl_device_op_queue_t *doq = get_device_op_queue(physical_device_id); + if (doq != nullptr) { + return &doq->locking_data; + } else { + return nullptr; + } +} + +acl_locking_data_t *get_device_op_queue_locking_data_from_context(cl_context context) { + if (context != nullptr && context->num_devices > 0) { + // all devices in a context are on the same device op queue + return get_device_op_queue_locking_data(context->device[0]); + } else { + return nullptr; + } +} + ACL_EXPORT CL_API_ENTRY void CL_API_CALL clTrackLiveObjectsIntelFPGA(cl_platform_id platform) { diff --git a/src/acl_printf.cpp b/src/acl_printf.cpp index fa66a8b7..274a828d 100644 --- a/src/acl_printf.cpp +++ b/src/acl_printf.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -987,9 +988,9 @@ static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel, // // Schedule enqueue read buffer to read printf buffer // The activation ID is the device op ID. 
-void acl_schedule_printf_buffer_pickup(int activation_id, int size, +void acl_schedule_printf_buffer_pickup(unsigned int physical_device_id, int activation_id, int size, int debug_dump_printf) { - acl_device_op_queue_t *doq = &(acl_platform.device_op_queue); + acl_device_op_queue_t *doq = get_device_op_queue(physical_device_id); // This function can potentially be called by a HAL that does not use the // ACL global lock, so we need to use acl_lock() instead of diff --git a/src/acl_program.cpp b/src/acl_program.cpp index d7a55f7d..e53b2d09 100644 --- a/src/acl_program.cpp +++ b/src/acl_program.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -2035,7 +2036,7 @@ int acl_submit_program_device_op(cl_event event) { return result; } if (!event->last_device_op) { - acl_device_op_queue_t *doq = &(acl_platform.device_op_queue); + acl_device_op_queue_t *doq = get_device_op_queue_from_context(event->context); acl_device_op_t *last_op = 0; // Precautionary, but it also nudges the device scheduler to try diff --git a/src/acl_thread.cpp b/src/acl_thread.cpp index d7b2e70c..21a222bf 100644 --- a/src/acl_thread.cpp +++ b/src/acl_thread.cpp @@ -11,8 +11,13 @@ #include #include #include +#include #include +// Global locking data. +// Just like members of acl_locking_data_t but with +// "static" on the l_acl_global_condvar and ACL_TLS on +// lock_counts and sig_flag. 
ACL_TLS int acl_global_lock_count = 0; ACL_TLS int acl_inside_sig_flag = 0; ACL_TLS int acl_inside_sig_old_lock_count = 0; @@ -22,53 +27,109 @@ static struct acl_condvar_s l_acl_global_condvar; // l_init_once() is defined in an OS-specific section below static void l_init_once(); -void acl_lock() { - l_init_once(); - if (acl_global_lock_count == 0) { - acl_acquire_condvar(&l_acl_global_condvar); +void acl_lock(acl_locking_data_t *locking_data) { + + if (locking_data == nullptr) { + // Condvar is not specified, so use the global one + l_init_once(); + if (acl_global_lock_count == 0) { + acl_acquire_condvar(&l_acl_global_condvar); + } + acl_global_lock_count++; + + } else { + + // Locking data (condvar and associated counters) is given. + // This condvar must have already been initialized during its owner's + // creation time. + if (locking_data->lock_count == 0) { + acl_acquire_condvar(&locking_data->condvar); + } + locking_data->lock_count++; } - acl_global_lock_count++; } -void acl_unlock() { - acl_assert_locked(); - acl_global_lock_count--; - if (acl_global_lock_count == 0) { - acl_release_condvar(&l_acl_global_condvar); +void acl_unlock(acl_locking_data_t *locking_data) { + + if (locking_data == nullptr) { + acl_assert_locked(); + acl_global_lock_count--; + if (acl_global_lock_count == 0) { + acl_release_condvar(&l_acl_global_condvar); + } + } else { + acl_assert_locked(locking_data); + locking_data->lock_count--; + if (locking_data->lock_count == 0) { + acl_release_condvar(&locking_data->condvar); + } + } +} + +int acl_is_locked_callback(acl_locking_data_t *locking_data) { + if (locking_data == nullptr) { + return (acl_global_lock_count > 0); + } else { + return (locking_data->lock_count > 0); } } -int acl_is_locked_callback(void) { return (acl_global_lock_count > 0); } +int acl_suspend_lock(acl_locking_data_t *locking_data) { -int acl_suspend_lock() { - int old_lock_count = acl_global_lock_count; - acl_global_lock_count = 0; - if (old_lock_count > 0) - 
acl_release_condvar(&l_acl_global_condvar); - return old_lock_count; + if (locking_data == nullptr) { + int old_lock_count = acl_global_lock_count; + acl_global_lock_count = 0; + if (old_lock_count > 0) + acl_release_condvar(&l_acl_global_condvar); + return old_lock_count; + } else { + int old_lock_count = locking_data->lock_count; + locking_data->lock_count = 0; + if (old_lock_count > 0) + acl_release_condvar(&locking_data->condvar); + return old_lock_count; + } } -void acl_resume_lock(int lock_count) { - acl_assert_unlocked(); - if (lock_count > 0) - acl_acquire_condvar(&l_acl_global_condvar); - acl_global_lock_count = lock_count; +void acl_resume_lock(int lock_count, acl_locking_data_t *locking_data) { + acl_assert_unlocked(locking_data); + if (locking_data == nullptr) { + if (lock_count > 0) { + acl_acquire_condvar(&l_acl_global_condvar); + } + acl_global_lock_count = lock_count; + } else { + if (lock_count > 0) { + acl_acquire_condvar(&locking_data->condvar); + } + locking_data->lock_count = lock_count; + } } void acl_wait_for_device_update(cl_context context) { + acl_locking_data_t *locking_data = get_device_op_queue_locking_data_from_context(context); + //acl_assert_locked(locking_data); acl_assert_locked(); if (acl_get_hal()->get_debug_verbosity && acl_get_hal()->get_debug_verbosity() > 0) { unsigned timeout = 5; // Seconds // Keep waiting until signal is received + //while (acl_timed_wait_condvar(&locking_data->condvar, timeout)) while (acl_timed_wait_condvar(&l_acl_global_condvar, timeout)) acl_context_print_hung_device_status(context); } else { + //acl_wait_condvar(&locking_data->condvar); acl_wait_condvar(&l_acl_global_condvar); } } -void acl_signal_device_update() { acl_signal_condvar(&l_acl_global_condvar); } +void acl_signal_device_update(acl_locking_data_t *locking_data) { + if (locking_data == nullptr) { + acl_signal_condvar(&l_acl_global_condvar); + } else { + acl_signal_condvar(&locking_data->condvar); + } +} #ifdef __linux__ @@ -139,9 +200,9 @@ 
static void l_init_once() { // a chance to execute. This function is useful for multithreaded hosts with // e.g. polling BSPs (using yield) to prevent one thread from hogging the mutex // while waiting for something like clFinish. -void acl_yield_lock_and_thread() { +void acl_yield_lock_and_thread(acl_locking_data_t *locking_data) { int lock_count; - lock_count = acl_suspend_lock(); + lock_count = acl_suspend_lock(locking_data); #ifdef __arm__ // arm-linux-gnueabihf-g++ version used is 4.7.1. // std::this_thread::yield can be enabled for it by defining @@ -152,5 +213,5 @@ void acl_yield_lock_and_thread() { #else std::this_thread::yield(); #endif - acl_resume_lock(lock_count); + acl_resume_lock(lock_count, locking_data); } diff --git a/src/acl_usm.cpp b/src/acl_usm.cpp index 94b8c6b4..f7834486 100644 --- a/src/acl_usm.cpp +++ b/src/acl_usm.cpp @@ -1071,7 +1071,7 @@ int acl_submit_usm_memcpy(cl_event event) { return result; } - acl_device_op_queue_t *doq = &(acl_platform.device_op_queue); + acl_device_op_queue_t *doq = get_device_op_queue_from_context(event->context); acl_device_op_t *last_op = 0; // Precautionary, but it also nudges the device scheduler to try diff --git a/test/acl_command_queue_test.cpp b/test/acl_command_queue_test.cpp index 56f30750..909b9430 100644 --- a/test/acl_command_queue_test.cpp +++ b/test/acl_command_queue_test.cpp @@ -730,9 +730,9 @@ MT_TEST(acl_command_queue, mixed_queue_dependencies_1) { // Finish e1, see if e2 gets submitted CHECK_EQUAL(e1->last_device_op, e1->current_device_op); - ACL_LOCKED(acl_receive_kernel_update(e1->current_device_op->id, CL_RUNNING)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, e1->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(context)); - ACL_LOCKED(acl_receive_kernel_update(e1->current_device_op->id, CL_COMPLETE)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, e1->current_device_op->id, CL_COMPLETE)); 
ACL_LOCKED(acl_idle_update(context)); CHECK_EQUAL(CL_COMPLETE, e2->execution_status); // The important check @@ -806,9 +806,9 @@ MT_TEST(acl_command_queue, mixed_queue_dependencies_2) { // Finish e1, see if e2 gets submitted CHECK_EQUAL(e1->last_device_op, e1->current_device_op); - ACL_LOCKED(acl_receive_kernel_update(e1->current_device_op->id, CL_RUNNING)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, e1->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(context)); - ACL_LOCKED(acl_receive_kernel_update(e1->current_device_op->id, CL_COMPLETE)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, e1->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(context)); CHECK_EQUAL(CL_COMPLETE, e2->execution_status); // The important check diff --git a/test/acl_context_test.cpp b/test/acl_context_test.cpp index 927f4596..a1283b32 100644 --- a/test/acl_context_test.cpp +++ b/test/acl_context_test.cpp @@ -1049,6 +1049,38 @@ MT_TEST(Context, compiler_mode) { } } +MT_TEST(Context, create_overlapping_contexts) { + cl_int status; + cl_context_properties valid_properties[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)m_platform, 0}; + + // device #0 should be on dev op queue 0 + status = CL_SUCCESS; + cl_context context0 = clCreateContext(valid_properties, 1, &m_device[0], 0, 0, &status); + CHECK_EQUAL(CL_SUCCESS, status); + + // devices #1 and #2 should be on dev op queue 1 + cl_context context1 = clCreateContext(valid_properties, 2, &m_device[1], 0, 0, &status); + CHECK_EQUAL(CL_SUCCESS, status); + + syncThreads(); + CHECK (acl_platform.physical_dev_id_to_doq_idx[0] != acl_platform.physical_dev_id_to_doq_idx[1]); + CHECK_EQUAL (acl_platform.physical_dev_id_to_doq_idx[1], acl_platform.physical_dev_id_to_doq_idx[2]); + + // now devices #0, #1, and #2 should all be on dev op queue 0 + cl_context context2 = clCreateContext(valid_properties, 3, &m_device[0], 0, 0, &status); + 
CHECK_EQUAL(CL_SUCCESS, status); + + syncThreads(); + CHECK_EQUAL (acl_platform.physical_dev_id_to_doq_idx[0], acl_platform.physical_dev_id_to_doq_idx[1]); + CHECK_EQUAL (acl_platform.physical_dev_id_to_doq_idx[0], acl_platform.physical_dev_id_to_doq_idx[2]); + + + clReleaseContext(context0); + clReleaseContext(context1); + clReleaseContext(context2); +} + MT_TEST(Context, offline_device) { cl_context_properties props[5]; // room enough to store two properties, their // values, and terminating NULL diff --git a/test/acl_device_op_test.cpp b/test/acl_device_op_test.cpp index e6c116fb..e96c8c80 100644 --- a/test/acl_device_op_test.cpp +++ b/test/acl_device_op_test.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -843,7 +844,7 @@ TEST(device_op, prune) { cl_event e0 = clCreateUserEvent(m_context, 0); CHECK(e0); - acl_device_op_queue_t *doq = &(acl_platform.device_op_queue); + acl_device_op_queue_t *doq = get_device_op_queue_from_context(m_context); acl_device_op_t *op0 = acl_propose_device_op(doq, ACL_DEVICE_OP_NONE, e0); acl_device_op_t *op1 = acl_propose_device_op(doq, ACL_DEVICE_OP_NONE, e0); diff --git a/test/acl_hal_test.cpp b/test/acl_hal_test.cpp index 03f752ab..200195e9 100644 --- a/test/acl_hal_test.cpp +++ b/test/acl_hal_test.cpp @@ -463,19 +463,19 @@ void acltest_call_event_update_callback(cl_event event, int new_status) { acltest_hal_event_callback(event, new_status); } -void acltest_call_kernel_update_callback(int activation_id, cl_int status) { - acltest_hal_kernel_callback(activation_id, status); +void acltest_call_kernel_update_callback(unsigned int physical_device_id, int activation_id, cl_int status) { + acltest_hal_kernel_callback(physical_device_id, activation_id, status); } -void acltest_call_device_update_callback(unsigned physical_device_id, +void acltest_call_device_update_callback(unsigned int physical_device_id, int device_status) { acltest_hal_device_callback(physical_device_id, 
(CL_EXCEPTION_TYPE_INTEL)device_status, NULL, 0); } -void acltest_call_printf_buffer_callback(int activation_id, int size, +void acltest_call_printf_buffer_callback(unsigned int physical_device_id, int activation_id, int size, int stalled) { - acltest_process_printf_buffer_callback(activation_id, size, stalled); + acltest_process_printf_buffer_callback(physical_device_id, activation_id, size, stalled); } void acltest_hal_launch_kernel( diff --git a/test/acl_hal_test.h b/test/acl_hal_test.h index a7e3949a..58c514b2 100644 --- a/test/acl_hal_test.h +++ b/test/acl_hal_test.h @@ -20,8 +20,8 @@ void acl_test_hal_set_physical_memory_support(bool value); extern bool acltest_hal_emulate_device_mem; void acltest_call_event_update_callback(cl_event event, int new_status); -void acltest_call_kernel_update_callback(int activation_id, cl_int status); -void acltest_call_printf_buffer_callback(int activation_id, int size, +void acltest_call_kernel_update_callback(unsigned int physical_device_id, int activation_id, cl_int status); +void acltest_call_printf_buffer_callback(unsigned int physical_device_id, int activation_id, int size, int stalled); #endif diff --git a/test/acl_kernel_test.cpp b/test/acl_kernel_test.cpp index c29272f6..e9222153 100644 --- a/test/acl_kernel_test.cpp +++ b/test/acl_kernel_test.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -44,7 +45,7 @@ static void CL_CALLBACK test_debug_print(const char *errinfo, void *user_data); static const acl_device_op_t *l_find_op(int id) { - const acl_device_op_queue_t *doq = &acl_platform.device_op_queue; + const acl_device_op_queue_t *doq = get_device_op_queue(0); if (id >= 0 && id < doq->max_ops) { return doq->op + id; } @@ -68,7 +69,6 @@ MT_TEST_GROUP(acl_kernel) { void setup() { if (threadNum() == 0) { acl_test_setup_generic_system(); - acl_dot_push(&m_devlog, &acl_platform.device_op_queue); } syncThreads(); @@ -76,6 +76,10 @@ MT_TEST_GROUP(acl_kernel) { m_program = 
this->load_program(); this->build(m_program); + if (threadNum() == 0) { + acl_dot_push(&m_devlog, get_device_op_queue(0)); + } + // See acl_globals_test.cpp m_sample_kernel_name = "kernel0_copy_vecin_vecout"; m_sample_kernel_accel_id = 0; @@ -1369,7 +1373,9 @@ TEST(acl_kernel, enqueue_ndrange) { // The commit to the devcie op queue should have set the activation_id. CHECK_EQUAL(active_op->id, invocation->activation_id); - ACL_LOCKED(acl_receive_kernel_update(active_op->id, CL_RUNNING)); + // Not sure how to get actual physical device id that ran this kernel, + // so using device[0] + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, active_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); this->load_times(event, times); CHECK(times[1] < times[2]); @@ -1385,7 +1391,7 @@ TEST(acl_kernel, enqueue_ndrange) { // progress. CHECK_EQUAL(2, acl_ref_count(kernel)); - ACL_LOCKED(acl_receive_kernel_update(active_op->id, CL_COMPLETE)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, active_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); this->load_times(event, times); CHECK(times[2] < times[3]); @@ -1859,9 +1865,9 @@ TEST(acl_kernel, enqueue_ndrange_workgroup_invariant_kernel) { // Fake completion of the task. ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); CHECK_EQUAL(CL_SUCCESS, clWaitForEvents(1, &event)); @@ -1946,7 +1952,7 @@ TEST(acl_kernel, enqueue_ndrange_workitem_invariant_kernel) { // Fake completion of the task. 
ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); // Since the ndrange is serialized to "global_size" tasks, the first // "global_size-1" completions should put the state back to running, and not @@ -1956,13 +1962,13 @@ TEST(acl_kernel, enqueue_ndrange_workitem_invariant_kernel) { int num_updates; ACL_LOCKED(acl_print_debug_msg("serilialized event:%d\n", (int)i)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(num_updates = acl_update_queue(m_cq)); CHECK_EQUAL(0, num_updates); } ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); CHECK_EQUAL(CL_SUCCESS, clWaitForEvents(1, &event)); ACL_LOCKED(acl_print_debug_msg("here 7\n")); @@ -2035,7 +2041,7 @@ TEST(acl_kernel, enqueue_task) { CHECK_EQUAL(0, times[3]); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); this->load_times(event, times); CHECK(times[1] < times[2]); @@ -2046,7 +2052,7 @@ TEST(acl_kernel, enqueue_task) { // not complete yet. CHECK_EQUAL(0, times[3]); - ACL_LOCKED(acl_receive_kernel_update(active_op->id, CL_COMPLETE)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, active_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); this->load_times(event, times); CHECK(times[2] < times[3]); @@ -2171,10 +2177,10 @@ TEST(acl_kernel, local_arg_alloc) { // Fake completion of the task. 
acl_print_debug_msg(" set running\n"); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); acl_print_debug_msg(" set complete\n"); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); acl_print_debug_msg("wait for events\n"); CHECK_EQUAL(CL_SUCCESS, clWaitForEvents(1, &event)); @@ -2239,7 +2245,7 @@ TEST(acl_kernel, fast_launch_with_dependencies_ooo) { // previous is finished ACL_LOCKED( - acl_receive_kernel_update(event[0]->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[0]->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_RUNNING, event[0]->execution_status); @@ -2247,14 +2253,14 @@ TEST(acl_kernel, fast_launch_with_dependencies_ooo) { CHECK_EQUAL(CL_SUBMITTED, event[2]->execution_status); // buffered on device ACL_LOCKED( - acl_receive_kernel_update(event[0]->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[0]->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_SUBMITTED, event[1]->execution_status); CHECK_EQUAL(CL_SUBMITTED, event[2]->execution_status); ACL_LOCKED( - acl_receive_kernel_update(event[1]->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[1]->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(event[2]->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[2]->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_COMPLETE, event[0]->execution_status); @@ -2262,9 +2268,9 @@ TEST(acl_kernel, 
fast_launch_with_dependencies_ooo) { CHECK_EQUAL(CL_RUNNING, event[2]->execution_status); ACL_LOCKED( - acl_receive_kernel_update(event[1]->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[1]->current_device_op->id, CL_COMPLETE)); ACL_LOCKED( - acl_receive_kernel_update(event[2]->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[2]->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); clReleaseEvent(event[0]); @@ -2316,7 +2322,7 @@ TEST(acl_kernel, fast_launch_with_dependencies) { event[2]->execution_status); // stalled in the device_op_queue ACL_LOCKED( - acl_receive_kernel_update(event[0]->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[0]->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_RUNNING, event[0]->execution_status); @@ -2325,10 +2331,10 @@ TEST(acl_kernel, fast_launch_with_dependencies) { event[2]->execution_status); // stalled in the device_op_queue ACL_LOCKED( - acl_receive_kernel_update(event[0]->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[0]->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); ACL_LOCKED( - acl_receive_kernel_update(event[1]->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[1]->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_COMPLETE, event[0]->execution_status); @@ -2336,13 +2342,13 @@ TEST(acl_kernel, fast_launch_with_dependencies) { CHECK_EQUAL(CL_SUBMITTED, event[2]->execution_status); // buffered on device ACL_LOCKED( - acl_receive_kernel_update(event[1]->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[1]->current_device_op->id, 
CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); ACL_LOCKED( - acl_receive_kernel_update(event[2]->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[2]->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); ACL_LOCKED( - acl_receive_kernel_update(event[2]->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event[2]->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_COMPLETE, event[0]->execution_status); @@ -2414,14 +2420,14 @@ TEST(acl_kernel, multi_queue) { CHECK_EQUAL(CL_SUBMITTED, active_op->status); CHECK_EQUAL(CL_QUEUED, stalled_op->status); - ACL_LOCKED(acl_receive_kernel_update(active_op->id, CL_RUNNING)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, active_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // Still stalled CHECK_EQUAL(CL_RUNNING, active_op->status); CHECK_EQUAL(CL_QUEUED, stalled_op->status); - ACL_LOCKED(acl_receive_kernel_update(active_op->id, CL_COMPLETE)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, active_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // Finally submitted @@ -2429,11 +2435,11 @@ TEST(acl_kernel, multi_queue) { CHECK_EQUAL(CL_COMPLETE, active_op->info.event->execution_status); CHECK_EQUAL(CL_SUBMITTED, stalled_op->status); - ACL_LOCKED(acl_receive_kernel_update(stalled_op->id, CL_RUNNING)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, stalled_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_RUNNING, stalled_op->status); - ACL_LOCKED(acl_receive_kernel_update(stalled_op->id, CL_COMPLETE)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, stalled_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_COMPLETE, stalled_op->status); 
CHECK_EQUAL(CL_COMPLETE, stalled_op->info.event->execution_status); @@ -2498,14 +2504,14 @@ TEST(acl_kernel, multi_queue_with_fast_launch) { CHECK_EQUAL(CL_SUBMITTED, active_op->status); CHECK_EQUAL(CL_SUBMITTED, buffered_op->status); - ACL_LOCKED(acl_receive_kernel_update(active_op->id, CL_RUNNING)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, active_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // Still buffered CHECK_EQUAL(CL_RUNNING, active_op->status); CHECK_EQUAL(CL_SUBMITTED, buffered_op->status); - ACL_LOCKED(acl_receive_kernel_update(active_op->id, CL_COMPLETE)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, active_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // Runtime still considers the event buffered until the kernel reports it is @@ -2514,11 +2520,11 @@ TEST(acl_kernel, multi_queue_with_fast_launch) { CHECK_EQUAL(CL_COMPLETE, active_op->info.event->execution_status); CHECK_EQUAL(CL_SUBMITTED, buffered_op->status); - ACL_LOCKED(acl_receive_kernel_update(buffered_op->id, CL_RUNNING)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, buffered_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_RUNNING, buffered_op->status); - ACL_LOCKED(acl_receive_kernel_update(buffered_op->id, CL_COMPLETE)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0]->def.physical_device_id, buffered_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_COMPLETE, buffered_op->status); CHECK_EQUAL(CL_COMPLETE, buffered_op->info.event->execution_status); @@ -2605,7 +2611,7 @@ TEST(acl_kernel, two_task) { times[1]); // has been submitted, becase we were not waiting on anything. // Waiting for response from device to say kernel has started running. 
ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to RUNNING = 1 @@ -2617,7 +2623,7 @@ TEST(acl_kernel, two_task) { times[2]); // The kernel is running, since the mem migration has happened. CHECK_EQUAL(0, times[3]); // not yet complete ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to COMPLETE + // submit KERNEL2 to device = 2 @@ -2646,14 +2652,14 @@ TEST(acl_kernel, two_task) { // Waiting for response from device to say kernel has started running. CHECK_EQUAL(0, times[3]); ACL_LOCKED( - acl_receive_kernel_update(event2->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event2->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL2 to RUNNING = 1 CHECK_EQUAL(offset + 12, m_devlog.num_ops); this->load_times(event, times); CHECK(times[1] < times[2]); ACL_LOCKED( - acl_receive_kernel_update(event2->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event2->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL2 to COMPLETE = 1 CHECK_EQUAL(offset + 13, m_devlog.num_ops); @@ -2730,7 +2736,7 @@ TEST(acl_kernel, two_task_with_fast_relaunch) { times[1]); // has been submitted, because we were not waiting on anything. // Waiting for response from device to say kernel has started running. 
ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to RUNNING = 1 @@ -2743,7 +2749,7 @@ TEST(acl_kernel, two_task_with_fast_relaunch) { times[2]); // The kernel is running, since the mem migration has happened. CHECK_EQUAL(0, times[3]); // not yet complete ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to COMPLETE = 1 CHECK_EQUAL(offset + 11, m_devlog.num_ops); @@ -2771,14 +2777,14 @@ TEST(acl_kernel, two_task_with_fast_relaunch) { // Waiting for response from device to say kernel has started running. CHECK_EQUAL(0, times[3]); ACL_LOCKED( - acl_receive_kernel_update(event2->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event2->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL2 to RUNNING = 1 CHECK_EQUAL(offset + 12, m_devlog.num_ops); this->load_times(event, times); CHECK(times[1] < times[2]); ACL_LOCKED( - acl_receive_kernel_update(event2->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event2->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL2 to COMPLETE = 1 CHECK_EQUAL(offset + 13, m_devlog.num_ops); @@ -2867,7 +2873,7 @@ TEST(acl_kernel, fast_relaunch_with_subbuffer) { times[1]); // has been submitted, because we were not waiting on anything. // Waiting for response from device to say kernel has started running. 
ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to RUNNING = 1 @@ -2880,7 +2886,7 @@ TEST(acl_kernel, fast_relaunch_with_subbuffer) { times[2]); // The kernel is running, since the mem migration has happened. CHECK_EQUAL(0, times[3]); // not yet complete ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to COMPLETE + // set MEM_MIGRATE2 to RUNNING + @@ -2913,14 +2919,14 @@ TEST(acl_kernel, fast_relaunch_with_subbuffer) { // Waiting for response from device to say kernel has started running. CHECK_EQUAL(0, times[3]); ACL_LOCKED( - acl_receive_kernel_update(event2->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event2->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL2 to RUNNING = 1 CHECK_EQUAL(offset + 12, m_devlog.num_ops); this->load_times(event, times); CHECK(times[1] < times[2]); ACL_LOCKED( - acl_receive_kernel_update(event2->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event2->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL2 to COMPLETE = 1 CHECK_EQUAL(offset + 13, m_devlog.num_ops); @@ -2977,7 +2983,7 @@ TEST(acl_kernel, two_task_with_fast_relaunch_id_conflict) { CHECK_EQUAL(0, event1->is_on_device_op_queue); ACL_LOCKED( - acl_receive_kernel_update(event0->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event0->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(2, 
m_cq->num_commands); @@ -2988,7 +2994,7 @@ TEST(acl_kernel, two_task_with_fast_relaunch_id_conflict) { 0, event1->is_on_device_op_queue); // good shouldn't of been submitted ACL_LOCKED( - acl_receive_kernel_update(event0->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event0->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(1, m_cq->num_commands); @@ -2998,11 +3004,11 @@ TEST(acl_kernel, two_task_with_fast_relaunch_id_conflict) { CHECK_EQUAL(1, event1->is_on_device_op_queue); // now it is safe ACL_LOCKED( - acl_receive_kernel_update(event1->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event1->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); ACL_LOCKED( - acl_receive_kernel_update(event1->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event1->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_SUCCESS, clFinish(m_cq)); @@ -3116,9 +3122,9 @@ TEST(acl_kernel, enqueue_ndrange_max_global_work_dim_zero_kernel) { local_size, 0, 0, &event)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); work_dim = 2; CHECK_EQUAL(CL_INVALID_GLOBAL_WORK_SIZE, @@ -3134,9 +3140,9 @@ TEST(acl_kernel, enqueue_ndrange_max_global_work_dim_zero_kernel) { local_size, 0, 0, &event)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - 
acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); work_dim = 3; CHECK_EQUAL(CL_INVALID_GLOBAL_WORK_SIZE, @@ -3152,9 +3158,9 @@ TEST(acl_kernel, enqueue_ndrange_max_global_work_dim_zero_kernel) { local_size, 0, 0, &event)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); CHECK_EQUAL(CL_SUCCESS, clFinish(m_cq)); @@ -3189,9 +3195,9 @@ TEST(acl_kernel, enqueue_ndrange_max_global_work_dim_one_kernel) { local_size, 0, 0, &event)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); work_dim = 2; CHECK_EQUAL(CL_INVALID_GLOBAL_WORK_SIZE, @@ -3207,9 +3213,9 @@ TEST(acl_kernel, enqueue_ndrange_max_global_work_dim_one_kernel) { local_size, 0, 0, &event)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); work_dim = 3; CHECK_EQUAL(CL_INVALID_GLOBAL_WORK_SIZE, @@ -3225,9 +3231,9 @@ TEST(acl_kernel, enqueue_ndrange_max_global_work_dim_one_kernel) { 
local_size, 0, 0, &event)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); CHECK_EQUAL(CL_SUCCESS, clFinish(m_cq)); @@ -3264,9 +3270,9 @@ TEST(acl_kernel, enqueue_ndrange_max_global_work_dim_two_kernel) { local_size, 0, 0, &event)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); work_dim = 2; CHECK_EQUAL(CL_SUCCESS, @@ -3274,9 +3280,9 @@ TEST(acl_kernel, enqueue_ndrange_max_global_work_dim_two_kernel) { local_size, 0, 0, &event)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); work_dim = 3; CHECK_EQUAL(CL_INVALID_GLOBAL_WORK_SIZE, @@ -3292,9 +3298,9 @@ TEST(acl_kernel, enqueue_ndrange_max_global_work_dim_two_kernel) { local_size, 0, 0, &event)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(event->current_device_op->id, CL_COMPLETE)); + 
acl_receive_kernel_update(m_device[0]->def.physical_device_id, event->current_device_op->id, CL_COMPLETE)); CHECK_EQUAL(CL_SUCCESS, clFinish(m_cq)); @@ -3383,7 +3389,7 @@ TEST_GROUP(acl_kernel_reprogram_scheduler) { CHECK_EQUAL(0, m_device->last_bin); CHECK_EQUAL(0, m_device->loaded_bin); - acl_dot_push(&m_devlog, &acl_platform.device_op_queue); + acl_dot_push(&m_devlog, get_device_op_queue(m_device->def.physical_device_id)); } void unload(void) { @@ -3670,10 +3676,10 @@ TEST(acl_kernel_reprogram_scheduler, release_and_reprogram) { CHECK_EQUAL(CL_SUCCESS, clEnqueueTask(m_cq, k0, 0, NULL, &k_e)); CHECK(k_e != NULL); - ACL_LOCKED(acl_receive_kernel_update(k_e->current_device_op->id, CL_RUNNING)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0].def.physical_device_id, k_e->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); ACL_LOCKED( - acl_receive_kernel_update(k_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k_e->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(k_e)); CHECK_EQUAL(CL_SUCCESS, clReleaseKernel(k0)); @@ -3686,10 +3692,10 @@ TEST(acl_kernel_reprogram_scheduler, release_and_reprogram) { CHECK_EQUAL(CL_SUCCESS, clEnqueueTask(m_cq, k1, 0, NULL, &k_e)); CHECK(k_e != NULL); - ACL_LOCKED(acl_receive_kernel_update(k_e->current_device_op->id, CL_RUNNING)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0].def.physical_device_id, k_e->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); ACL_LOCKED( - acl_receive_kernel_update(k_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k_e->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(mem)); CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(k_e)); @@ -3795,7 +3801,7 @@ TEST(acl_kernel_reprogram_scheduler, require_reprogram) { 
// Pretend to start the kernel acl_print_debug_msg("Say kernel is running\n"); - ACL_LOCKED(acl_receive_kernel_update(k_e->current_device_op->id, CL_RUNNING)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0].def.physical_device_id, k_e->current_device_op->id, CL_RUNNING)); CHECK_EQUAL(CL_RUNNING, k_e->current_device_op->execution_status); ACL_LOCKED(acl_idle_update(m_context)); @@ -3815,7 +3821,7 @@ TEST(acl_kernel_reprogram_scheduler, require_reprogram) { acl_print_debug_msg("Say kernel is complete\n"); ACL_LOCKED( - acl_receive_kernel_update(k_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k_e->current_device_op->id, CL_COMPLETE)); CHECK_EQUAL(CL_COMPLETE, k_e->current_device_op->execution_status); ACL_LOCKED(acl_idle_update(m_context)); @@ -3830,7 +3836,8 @@ TEST(acl_kernel_reprogram_scheduler, require_reprogram) { CHECK_EQUAL(1, op3a->last_in_group); // Completion timestamp has propagated up to the user level event. - CHECK_EQUAL(acl_platform.device_op_queue.op[op3a->id].timestamp[CL_COMPLETE], + acl_device_op_queue_t *doq = get_device_op_queue(m_device[0].def.physical_device_id); + CHECK_EQUAL(doq->op[op3a->id].timestamp[CL_COMPLETE], k_e->timestamp[CL_COMPLETE]); // Completion wipes out the downlink. @@ -3943,9 +3950,9 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) { CHECK_EQUAL(1, op5->last_in_group); ACL_LOCKED( - acl_receive_kernel_update(k0_e->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k0_e->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(k0_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k0_e->current_device_op->id, CL_COMPLETE)); // Count mem copies. 
num_read_mems = 0; @@ -3975,9 +3982,9 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) { acl_test_setenv("ACL_PCIE_USE_JTAG_PROGRAMMING", "1"); ACL_LOCKED( - acl_receive_kernel_update(k1_e->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k1_e->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(k1_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k1_e->current_device_op->id, CL_COMPLETE)); // Force the schedule update! ACL_LOCKED(acl_idle_update(m_context)); @@ -4054,9 +4061,9 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) { CHECK_EQUAL(1, op5->last_in_group); ACL_LOCKED( - acl_receive_kernel_update(k0_e->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k0_e->current_device_op->id, CL_RUNNING)); ACL_LOCKED( - acl_receive_kernel_update(k0_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k0_e->current_device_op->id, CL_COMPLETE)); // Force the schedule update! 
ACL_LOCKED(acl_idle_update(m_context)); @@ -4119,7 +4126,7 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) { } ACL_LOCKED( - acl_receive_kernel_update(k1_e->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k1_e->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to RUNNING = 1 @@ -4132,7 +4139,7 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) { CHECK_EQUAL(1, op->last_in_group); ACL_LOCKED( - acl_receive_kernel_update(k1_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k1_e->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to COMPLETE = 1 @@ -4204,7 +4211,7 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) { // Pretend to complete the third kernel acl_print_debug_msg("Forcing kernel2 running\n"); ACL_LOCKED( - acl_receive_kernel_update(k2_e->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k2_e->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); expectedNumOps++; @@ -4218,7 +4225,7 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) { CHECK_EQUAL(1, op->last_in_group); ACL_LOCKED( - acl_receive_kernel_update(k2_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k2_e->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); expectedNumOps++; @@ -4264,14 +4271,14 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) { // At this point all three events should be in the device_op_queue ACL_LOCKED( - acl_receive_kernel_update(k0_e->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k0_e->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to RUNNING = 1 // still can't execute reprogram CHECK_EQUAL(offset + 6, m_devlog.num_ops); 
ACL_LOCKED( - acl_receive_kernel_update(k0_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k0_e->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to COMPLETE + // submit REPROGRAM2 to device + @@ -4285,13 +4292,13 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) { CHECK_EQUAL(offset + 15, m_devlog.num_ops); ACL_LOCKED( - acl_receive_kernel_update(k1_e->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k1_e->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to RUNNING = 1 CHECK_EQUAL(offset + 16, m_devlog.num_ops); ACL_LOCKED( - acl_receive_kernel_update(k1_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k1_e->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL2 to COMPLETE + // submit REPROGRAM1 to device + <---This is the important one, must not be @@ -4302,10 +4309,10 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) { CHECK_EQUAL(offset + 25, m_devlog.num_ops); ACL_LOCKED( - acl_receive_kernel_update(k2_e->current_device_op->id, CL_RUNNING)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k2_e->current_device_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); ACL_LOCKED( - acl_receive_kernel_update(k2_e->current_device_op->id, CL_COMPLETE)); + acl_receive_kernel_update(m_device[0].def.physical_device_id, k2_e->current_device_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); // set KERNEL1 to RUNNING + // set KERNEL1 to COMPLETE = 2 @@ -4392,9 +4399,9 @@ TEST(acl_kernel_reprogram_scheduler, use_host_buf_use_twice_same_invocation) { CHECK_EQUAL(CL_SUCCESS, clEnqueueTask(m_cq, k, 0, 0, &ke)); acl_device_op_t *active_op = ke->current_device_op; - ACL_LOCKED(acl_receive_kernel_update(active_op->id, CL_RUNNING)); + 
ACL_LOCKED(acl_receive_kernel_update(m_device[0].def.physical_device_id, active_op->id, CL_RUNNING)); ACL_LOCKED(acl_idle_update(m_context)); - ACL_LOCKED(acl_receive_kernel_update(active_op->id, CL_COMPLETE)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0].def.physical_device_id, active_op->id, CL_COMPLETE)); ACL_LOCKED(acl_idle_update(m_context)); clReleaseEvent(ke); @@ -4458,7 +4465,7 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { } } // Say it's running. - ACL_LOCKED(acl_receive_kernel_update(op->id, CL_RUNNING)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0].def.physical_device_id, op->id, CL_RUNNING)); // Check that acl_receive_kernel_update does the right thing with the index. CHECK_EQUAL(CL_RUNNING, l_find_op(op->id)->execution_status); @@ -4478,7 +4485,8 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { CHECK_EQUAL(9, m_devlog.num_ops); op = &(m_devlog.before[8]); - CHECK_EQUAL(acl_platform.device_op_queue.op + op->id, ke->current_device_op); + acl_device_op_queue_t *doq = get_device_op_queue_from_context(m_context); + CHECK_EQUAL(doq->op + op->id, ke->current_device_op); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(ke, op->info.event); @@ -4499,17 +4507,17 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { *(((int *)printf_buf->block_allocation->range.begin) + 1) = printf_data; // Check operation of printf-pickup-scheduler call back function. // Activation id should be the device op id! 
- ACL_LOCKED(acl_schedule_printf_buffer_pickup(op->id, printf_bytes, + ACL_LOCKED(acl_schedule_printf_buffer_pickup(m_device[0].def.physical_device_id, op->id, printf_bytes, 1 /*Debug printf dump*/)); CHECK_EQUAL( printf_bytes, - acl_platform.device_op_queue.op[op->id].info.num_printf_bytes_pending); + doq->op[op->id].info.num_printf_bytes_pending); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(10, m_devlog.num_ops); op = &(m_devlog.before[9]); - CHECK_EQUAL(acl_platform.device_op_queue.op + op->id, ke->current_device_op); + CHECK_EQUAL(doq->op + op->id, ke->current_device_op); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(ke, op->info.event); @@ -4530,17 +4538,17 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { // Check operation of printf-pickup-scheduler call back function. // Activation id should be the device op id! // Now we have two printf_data in the buffer, therefore the size is doubled. - ACL_LOCKED(acl_schedule_printf_buffer_pickup(op->id, printf_bytes * 2, + ACL_LOCKED(acl_schedule_printf_buffer_pickup(m_device[0].def.physical_device_id, op->id, printf_bytes * 2, 0 /*Not debug printf dump*/)); CHECK_EQUAL( printf_bytes * 2, - acl_platform.device_op_queue.op[op->id].info.num_printf_bytes_pending); + doq->op[op->id].info.num_printf_bytes_pending); ACL_LOCKED(acl_idle_update(m_context)); CHECK_EQUAL(11, m_devlog.num_ops); op = &(m_devlog.before[10]); - CHECK_EQUAL(acl_platform.device_op_queue.op + op->id, ke->current_device_op); + CHECK_EQUAL(doq->op + op->id, ke->current_device_op); CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op->info.type); CHECK_EQUAL(CL_RUNNING, op->status); CHECK_EQUAL(ke, op->info.event); @@ -4558,13 +4566,12 @@ TEST(acl_kernel_reprogram_scheduler, printf_handler) { // Testing normal printf dump end // Say it's complete, but with some printf stuff to clean up. 
- ACL_LOCKED(acl_receive_kernel_update(op->id, CL_COMPLETE)); + ACL_LOCKED(acl_receive_kernel_update(m_device[0].def.physical_device_id, op->id, CL_COMPLETE)); // Check that acl_receive_kernel_update does the right thing with the index. CHECK_EQUAL(CL_COMPLETE, l_find_op(op->id)->execution_status); CHECK_EQUAL(CL_RUNNING, l_find_op(op->id)->status); // not copied up! // Set printf bytes which should be picked up - acl_platform.device_op_queue.op[op->id].info.num_printf_bytes_pending = - printf_bytes; + doq->op[op->id].info.num_printf_bytes_pending = printf_bytes; CHECK_EQUAL(0, op->info.debug_dump_printf); ACL_LOCKED(acl_idle_update(m_context)); // bump scheduler diff --git a/test/acl_profiler_test.cpp b/test/acl_profiler_test.cpp index e6370966..33200713 100644 --- a/test/acl_profiler_test.cpp +++ b/test/acl_profiler_test.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -60,11 +61,12 @@ MT_TEST_GROUP(acl_profile) { putenv(profiler_timer_env_var); #endif acl_test_setup_generic_system(); - acl_dot_push(&m_devlog, &acl_platform.device_op_queue); this->load(); m_program = this->load_program(); this->build(m_program); + + acl_dot_push(&m_devlog, get_device_op_queue_from_context(m_context)); } syncThreads(); } diff --git a/test/acl_test.cpp b/test/acl_test.cpp index 1a194255..5a1f9fda 100644 --- a/test/acl_test.cpp +++ b/test/acl_test.cpp @@ -683,8 +683,8 @@ static void l_run_benchmark() { int activation_id = kernel_event->cmd.info.ndrange_kernel .invocation_wrapper->image->activation_id; - acltest_call_kernel_update_callback(activation_id, CL_RUNNING); - acltest_call_kernel_update_callback(activation_id, CL_COMPLETE); + acltest_call_kernel_update_callback(device->def.physical_device_id, activation_id, CL_RUNNING); + acltest_call_kernel_update_callback(device->def.physical_device_id, activation_id, CL_COMPLETE); status = clWaitForEvents(1, &kernel_event); assert(status == CL_SUCCESS); diff --git a/test/acl_thread_test.cpp 
b/test/acl_thread_test.cpp index 6577d4f1..3350dbc4 100644 --- a/test/acl_thread_test.cpp +++ b/test/acl_thread_test.cpp @@ -205,7 +205,7 @@ MT_TEST(acl_thread, kernel_and_printf_callback) { if (threadNum() < (int)device_def->accel.size()) { // signal kernel is running - acltest_call_kernel_update_callback(activation_id, CL_RUNNING); + acltest_call_kernel_update_callback(sys_def->device[0].physical_device_id, activation_id, CL_RUNNING); // wait for kernel status to change cl_int execution_status = CL_QUEUED; @@ -220,7 +220,7 @@ MT_TEST(acl_thread, kernel_and_printf_callback) { if (threadNum() < (int)device_def->accel.size()) { // signal kernel has printf buffer - acltest_call_printf_buffer_callback(activation_id, 100, 0); + acltest_call_printf_buffer_callback(sys_def->device[0].physical_device_id, activation_id, 100, 0); // wait for printf buffer to be cleared CHECK(kernel_event); @@ -237,7 +237,7 @@ MT_TEST(acl_thread, kernel_and_printf_callback) { if (threadNum() < (int)device_def->accel.size()) { // signal kernel is finished - acltest_call_kernel_update_callback(activation_id, CL_COMPLETE); + acltest_call_kernel_update_callback(sys_def->device[0].physical_device_id, activation_id, CL_COMPLETE); status = clWaitForEvents(1, &kernel_event); CHECK_EQUAL(CL_SUCCESS, status);