From 125e1b17d8335351f6bee11f1d4514afaf603a72 Mon Sep 17 00:00:00 2001 From: yoann-heitz Date: Tue, 21 Sep 2021 06:19:02 -0400 Subject: [PATCH 1/5] allow fine-grained control on several APIs Deleted a line that makes rocprof abort when using fine grained control one several APIs at a time. The parsing function in libtracer_tool.so that parses the input XML file that specifies the functions for which the user wants to activate the tracing callbacks seems to handle correctly cases where fine-grained control is used on several APIs. This line can be safely deleted. --- test/tool/tool.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 4bdce5dd..3773d6df 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -1108,7 +1108,6 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Getting traces const auto traces_list = xml->GetNodes("top.trace"); - if (traces_list.size() > 1) fatal("ROCProfiler: only one trace supported at a time"); const unsigned feature_count = metrics_vec.size() + traces_list.size(); rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; From 2b0aab57b1b2cd0006de368d7d2272b75e1d6c8c Mon Sep 17 00:00:00 2001 From: yoann-heitz Date: Wed, 22 Sep 2021 07:44:35 -0400 Subject: [PATCH 2/5] Isolate flushing instructions for metrics Isolated flushing instructions into a wrapping function. The file descriptor used for flushing is the one that is globally defined and not the one stored in the context_entry_t object as it is always the one that is defined globally that is used. If using a globally defined file descriptor is problematic (however it is the norm in the libtracer_tool.so library in ROCTracer) rather than passing it through arguments, the wrapping flushing function can be modified to take a void pointer. The file descriptor can then be passed through this pointer and be used in the default flushing function. --- test/tool/tool.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 3773d6df..6cac8f20 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -337,19 +337,21 @@ unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); } +void metric_flush_cb(const char *name, uint64_t result){ + fprintf(result_file_handle, " %s ", name); + fprintf(result_file_handle, "(%lu)\n", result); +} // Output profiling results for input features void output_results(const context_entry_t* entry, const char* label) { - FILE* file = entry->file_handle; const rocprofiler_feature_t* features = entry->features; const unsigned feature_count = entry->feature_count; for (unsigned i = 0; i < feature_count; ++i) { const rocprofiler_feature_t* p = &features[i]; - fprintf(file, " %s ", p->name); switch (p->data.kind) { // Output metrics results case ROCPROFILER_DATA_KIND_INT64: - fprintf(file, "(%lu)\n", p->data.result_int64); + metric_flush_cb(p->name, p->data.result_int64); break; default: fprintf(stderr, "RPL-tool: undefined data kind(%u)\n", p->data.kind); From 5ccee80b5b32f7cd6c01b68134f652f9c454df4a Mon Sep 17 00:00:00 2001 From: yoann-heitz Date: Wed, 22 Sep 2021 08:16:59 -0400 Subject: [PATCH 3/5] Isolate flushing instructions for kernel events Isolated flushing instructions into a wrapping function. The file descriptor used for flushing is the one that is globally defined and not the one stored in the context_entry_t object as it is always the one that is defined globally that is used. If using a globally defined file descriptor is problematic (however it is the norm in the libtracer_tool.so library in ROCTracer) rather than passing it through arguments, the wrapping flushing function can be modified to take a void pointer. The file descriptor can then be passed through this pointer and be used in the default flushing function. --- test/tool/tool.cpp | 66 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 6cac8f20..8c9267e8 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -373,6 +373,53 @@ void output_group(const context_entry_t* entry, const char* label) { } } +void kernel_flush_cb( uint32_t dispatch, + uint32_t gpu_id, + uint32_t queue_id, + uint64_t queue_index, + uint32_t pid, + uint32_t tid, + uint32_t grid_size, + uint32_t workgroup_size, + uint32_t lds_size, + uint32_t scratch_size, + uint32_t vgpr, + uint32_t sgpr, + uint32_t fbarrier_count, + uint64_t signal_handle, + uint64_t object, + const char* kernel_name, + bool record, + uint64_t dispatch_time, + uint64_t begin, + uint64_t end, + uint64_t complete){ + fprintf(result_file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), obj(0x%lx), kernel-name(\"%s\")", + dispatch, + gpu_id, + queue_id, + queue_index, + pid, + tid, + grid_size, + workgroup_size, + lds_size, + scratch_size, + vgpr, + sgpr, + fbarrier_count, + signal_handle, + object, + kernel_name); + if (record) fprintf(result_file_handle, ", time(%lu,%lu,%lu,%lu)", + dispatch_time, + begin, + end, + complete); + fprintf(result_file_handle, "\n"); + fflush(result_file_handle); +} + // Dump stored context entry bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { hsa_status_t status = HSA_STATUS_ERROR; @@ -391,12 +438,9 @@ bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { const uint32_t index = entry->index; if (index != UINT32_MAX) { - FILE* file_handle = entry->file_handle; const std::string nik_name = (to_truncate_names == 0) ? entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); - - fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), obj(0x%lx), kernel-name(\"%s\")", - index, + kernel_flush_cb(index, agent_info->dev_index, entry->data.queue_id, entry->data.queue_index, @@ -411,14 +455,12 @@ bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { entry->kernel_properties.fbarrier_count, entry->kernel_properties.signal.handle, entry->kernel_properties.object, - nik_name.c_str()); - if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", - record->dispatch, - record->begin, - record->end, - record->complete); - fprintf(file_handle, "\n"); - fflush(file_handle); + nik_name.c_str(), + record != NULL ? true : false, + record != NULL ? record->dispatch: 0, + record != NULL ? record->begin : 0, + record != NULL ? record->end : 0, + record != NULL ? record->complete : 0); } if (record && to_clean) { delete record; From 65161e2763be13042b1f99dd2e74f10e623efeb2 Mon Sep 17 00:00:00 2001 From: yoann-heitz Date: Thu, 23 Sep 2021 06:41:41 -0400 Subject: [PATCH 4/5] Update metrics flushing function signature The signature of the function that flush metrics has been updated in order to have a standard signature for all the flushing functions. A new data structure that is passed as the only one argument for the function has also been defined --- test/tool/tool.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 8c9267e8..374c10d7 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -337,9 +337,15 @@ unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); } -void metric_flush_cb(const char *name, uint64_t result){ - fprintf(result_file_handle, " %s ", name); - fprintf(result_file_handle, "(%lu)\n", result); +struct metric_trace_entry_t { + uint32_t dispatch; + const char* name; + uint64_t result; +}; + +void metric_flush_cb(metric_trace_entry_t *entry){ + fprintf(result_file_handle, " %s ", entry->name); + fprintf(result_file_handle, "(%lu)\n", entry->result); } // Output profiling results for input features void output_results(const context_entry_t* entry, const char* label) { @@ -350,8 +356,10 @@ void output_results(const context_entry_t* entry, const char* label) { const rocprofiler_feature_t* p = &features[i]; switch (p->data.kind) { // Output metrics results - case ROCPROFILER_DATA_KIND_INT64: - metric_flush_cb(p->name, p->data.result_int64); + case ROCPROFILER_DATA_KIND_INT64: { + metric_trace_entry_t metric_trace_entry = {entry->index, p->name, p->data.result_int64}; + metric_flush_cb(&metric_trace_entry); + } break; default: fprintf(stderr, "RPL-tool: undefined data kind(%u)\n", p->data.kind); From 24bf30f1fb9c9215cffb318e8e998b228009da46 Mon Sep 17 00:00:00 2001 From: yoann-heitz Date: Thu, 23 Sep 2021 06:42:15 -0400 Subject: [PATCH 5/5] Update kernel flushing function signature As for the metrics, the kernel flushing function signature has been updated and a new data structure has been defined --- test/tool/tool.cpp | 95 ++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 45 deletions(-) diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 374c10d7..06d11c13 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -381,49 +381,53 @@ void output_group(const context_entry_t* entry, const char* label) { } } -void kernel_flush_cb( uint32_t dispatch, - uint32_t gpu_id, - uint32_t queue_id, - uint64_t queue_index, - uint32_t pid, - uint32_t tid, - uint32_t grid_size, - uint32_t workgroup_size, - uint32_t lds_size, - uint32_t scratch_size, - uint32_t vgpr, - uint32_t sgpr, - uint32_t fbarrier_count, - uint64_t signal_handle, - uint64_t object, - const char* kernel_name, - bool record, - uint64_t dispatch_time, - uint64_t begin, - uint64_t end, - uint64_t complete){ +struct kernel_trace_entry_t { + uint32_t dispatch; + uint32_t gpu_id; + uint32_t queue_id; + uint64_t queue_index; + uint32_t pid; + uint32_t tid; + uint32_t grid_size; + uint32_t workgroup_size; + uint32_t lds_size; + uint32_t scratch_size; + uint32_t vgpr; + uint32_t sgpr; + uint32_t fbarrier_count; + uint64_t signal_handle; + uint64_t object; + const char* kernel_name; + bool record; + uint64_t dispatch_time; + uint64_t begin; + uint64_t end; + uint64_t complete; +}; + +void kernel_flush_cb(kernel_trace_entry_t* entry){ fprintf(result_file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), obj(0x%lx), kernel-name(\"%s\")", - dispatch, - gpu_id, - queue_id, - queue_index, - pid, - tid, - grid_size, - workgroup_size, - lds_size, - scratch_size, - vgpr, - sgpr, - fbarrier_count, - signal_handle, - object, - kernel_name); - if (record) fprintf(result_file_handle, ", time(%lu,%lu,%lu,%lu)", - dispatch_time, - begin, - end, - complete); + entry->dispatch, + entry->gpu_id, + entry->queue_id, + entry->queue_index, + entry->pid, + entry->tid, + entry->grid_size, + entry->workgroup_size, + entry->lds_size, + entry->scratch_size, + entry->vgpr, + entry->sgpr, + entry->fbarrier_count, + entry->signal_handle, + entry->object, + entry->kernel_name); + if (entry->record) fprintf(result_file_handle, ", time(%lu,%lu,%lu,%lu)", + entry->dispatch_time, + entry->begin, + entry->end, + entry->complete); fprintf(result_file_handle, "\n"); fflush(result_file_handle); } @@ -448,10 +452,10 @@ bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { if (index != UINT32_MAX) { const std::string nik_name = (to_truncate_names == 0) ? entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); - kernel_flush_cb(index, + kernel_trace_entry_t kernel_trace_entry = {entry->index, agent_info->dev_index, entry->data.queue_id, - entry->data.queue_index, + entry->data.queue_index, my_pid, entry->data.thread_id, entry->kernel_properties.grid_size, @@ -468,7 +472,8 @@ bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { record != NULL ? record->dispatch: 0, record != NULL ? record->begin : 0, record != NULL ? record->end : 0, - record != NULL ? record->complete : 0); + record != NULL ? record->complete : 0}; + kernel_flush_cb(&kernel_trace_entry); } if (record && to_clean) { delete record;