diff --git a/CMakeLists.txt b/CMakeLists.txt index 036b39fd..eda19fba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,8 @@ cmake_minimum_required ( VERSION 2.8.12 ) # Install prefix set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix default") +list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + ## Verbose output. set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) @@ -207,7 +209,7 @@ else() endif() message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" ) set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" ) -set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocm-core" ) ## Process the Debian install/remove scripts to update the CPACK variables configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY ) configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY ) @@ -234,12 +236,17 @@ if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) endif() set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" ) -set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocm-core" ) ## Process the Rpm install/remove scripts to update the CPACK variables configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/post.in" RPM/post @ONLY ) configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/postun.in" RPM/postun @ONLY ) set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/postun" ) +# Remove dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON not given to cmake +if(NOT ROCM_DEP_ROCMCORE) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_PACKAGE_REQUIRES ${CPACK_RPM_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PACKAGE_DEPENDS ${CPACK_DEBIAN_PACKAGE_DEPENDS})
+endif() include ( CPack ) diff --git a/README.md b/README.md new file mode 100644 index 00000000..e83b410c --- /dev/null +++ b/README.md @@ -0,0 +1,199 @@ +# ROC-profiler +ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. + +HW specific low-level performance analysis interface for profiling of GPU compute applications. The +profiling includes HW performance counters with complex performance metrics. + +To use the rocProfiler API you need the API header and to link your application with the rocprofiler .so library: + - the API header: /opt/rocm/rocprofiler/include/rocprofiler.h + - the .so library: /opt/rocm/lib/librocprofiler64.so + +## Documentation
- ['rocprof' cmdline tool specification](doc/rocprof.md) +- ['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) + +## Metrics +[The link to profiler default metrics XML specification](test/tool/metrics.xml) + + +## Source tree +``` + - bin + - rocprof - Profiling tool run script + - doc - Documentation + - inc/rocprofiler.h - Library public API + - src - Library sources + - core - Library API sources + - util - Library utils sources + - xml - XML parser + - test - Library test suite + - tool - Profiling tool + - tool.cpp - tool sources + - metrics.xml - metrics config file + - ctrl - Test control + - util - Test utils + - simple_convolution - Simple convolution test kernel +``` + +## Build environment: +``` + export CMAKE_PREFIX_PATH=: + export CMAKE_BUILD_TYPE= # release by default + export CMAKE_DEBUG_TRACE=1 # to enable debug tracing +``` + +## To build with the current installed ROCM: +``` + - ROCm is required. + ROCr-runtime and roctracer are needed + + - Python is required. 
+ The required modules: CppHeaderParser, argparse, sqlite3 + To install: + sudo pip install CppHeaderParser argparse sqlite3 + + - To build and install to /opt/rocm/rocprofiler + Please use release branches/tags of 'amd-master' branch for development version. + + export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm + + cd .../rocprofiler + ./build.sh +``` + +## Internal 'simple_convolution' test run script: +``` + cd .../rocprofiler/build + make mytest + run.sh +``` + +## To enable error messages logging to '/tmp/rocprofiler_log.txt': +``` + export ROCPROFILER_LOG=1 +``` + +## To enable verbose tracing: +``` + export ROCPROFILER_TRACE=1 +``` + +## Profiling utility usage: +``` +rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] + +Options: + -h - this help + --verbose - verbose mode, dumping all base counters used in the input metrics + --list-basic - to print the list of basic HW counters + --list-derived - to print the list of derived metrics with formulas + --cmd-qts - quoting profiled cmd-line [on] + + -i <.txt|.xml file> - input file + Input file .txt format, automatically rerun application for every pmc line: + + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts FetchSize + # Perf counters group 2 + pmc : VALUUtilization,WriteSize L2CacheHit + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 + + Input file .xml format, for single profiling run: + + # Metrics list definition, also the form ":" can be used + # All defined metrics can be found in the 'metrics.xml' + # There are basic metrics for raw HW counters and high-level metrics for derived counters + + + # Filter by dispatches range, GPU index and kernel names + + + -o - output CSV file [.csv] + The output CSV file columns meaning in the columns order: + Index - kernels dispatch order index + 
KernelName - the dispatched kernel name + gpu-id - GPU id the kernel was submitted to + queue-id - the ROCm queue unique id the kernel was submitted to + queue-index - The ROCm queue write index for the submitted AQL packet + tid - system application thread id which submitted the kernel + grd - the kernel's grid size + wgr - the kernel's work group size + lds - the kernel's LDS memory size + scr - the kernel's scratch memory size + vgpr - the kernel's VGPR size + sgpr - the kernel's SGPR size + fbar - the kernel's barriers limitation + sig - the kernel's completion signal + ... - The columns with the counters values per kernel dispatch + DispatchNs/BeginNs/EndNs/CompleteNs - timestamp columns if time-stamping was enabled + + -d - directory where profiler store profiling data including thread treaces [/tmp] + The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. + -t - to change the temporary directory [/tmp] + By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory. + + --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] + --timestamp - to turn on/off the kernel dispatches timestamps, dispatch/begin/end/complete [off] + Four kernel timestamps in nanoseconds are reported: + DispatchNs - the time when the kernel AQL dispatch packet was written to the queue + BeginNs - the kernel execution begin time + EndNs - the kernel execution end time + CompleteNs - the time when the completion signal of the AQL dispatch packet was received + + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] + --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [on] + To support V3 code-object. 
+ + --stats - generating kernel execution stats, file .stats.csv + + --roctx-trace - to enable rocTX application code annotation trace, "Markers and Ranges" JSON trace section. + --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible + Generated files: ._stats.txt .json + Traced API list can be set by input .txt or .xml files. + Input .txt: + hsa: hsa_queue_create hsa_amd_memory_pool_allocate + Input .xml: + + + + + + --trace-start - to enable tracing on start [on] + --trace-period - to enable trace with initial delay, with periodic sample length and rate + Supported time formats: + +Configuration file: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:$HOME: + First the configuration file is looking in the current directory, then in your home, and then in the package directory. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'. + An example of 'rpl_rc.xml': + +``` + + +## Known Issues: +- For workloads where the hip application might make more than 10 million HIP API calls, the application might crash with the error - "Profiling data corrupted" + - Suggested Workaround - Instead of profiling for the complete run, it is suggested to run profiling in parts by using the --trace-period option. +- When the same kernel is launched back to back multiple times on a GPU, the cache hit rate from rocprofiler is reported as 0% or very low. This also causes FETCH_SIZE to be not usable for repeatable kernel. 
diff --git a/Readme.txt b/Readme.txt deleted file mode 100644 index 9008165a..00000000 --- a/Readme.txt +++ /dev/null @@ -1,54 +0,0 @@ -ROC Profiler library. -Profiling with metrics and traces based on perfcounters (PMC) and traces (SPM). -Implementation is based on AqlProfile HSA extension. -Library supports GFX8/GFX9. - -The library source tree: - - doc - Documentation - - inc/rocprofiler.h - Library public API - - src - Library sources - - core - Library API sources - - util - Library utils sources - - xml - XML parser - - test - Library test suite - - ctrl - Test controll - - util - Test utils - - simple_convolution - Simple convolution test kernel - -Build environment: - -$ export CMAKE_PREFIX_PATH=: -$ export CMAKE_BUILD_TYPE= # release by default -$ export CMAKE_DEBUG_TRACE=1 # 1 to enable debug tracing - -To build with the current installed ROCM: - -$ cd .../rocprofiler -$ export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm -$ mkdir build -$ cd build -$ cmake .. -$ make - -To run the test: - -$ cd .../rocprofiler/build -$ export LD_LIBRARY_PATH=.: # paths to ROC profiler and oher libraries -$ export HSA_TOOLS_LIB=librocprofiler64.so # ROC profiler library loaded by HSA runtime -$ export ROCP_TOOL_LIB=test/libtool.so # tool library loaded by ROC profiler -$ export ROCP_METRICS=metrics.xml # ROC profiler metrics config file -$ export ROCP_INPUT=input.xml # input file for the tool library -$ export ROCP_OUTPUT_DIR=./ # output directory for the tool library, for metrics results file 'results.txt' and trace files -$ - -Internal 'simple_convolution' test run script: -$ cd .../rocprofiler/build -$ run.sh - -To enabled error messages logging to '/tmp/rocprofiler_log.txt': - -$ export ROCPROFILER_LOG=1 - -To enable verbose tracing: - -$ export ROCPROFILER_TRACE=1 diff --git a/doc/rocprof.md b/doc/rocprof.md new file mode 100644 index 00000000..3b4c9f99 --- /dev/null +++ b/doc/rocprof.md @@ -0,0 +1,393 @@ +# rocprof +## 1. 
Overview +The rocProf is a command line tool implemented on the top of rocProfiler and rocTracer APIs. Source code for rocProf may be found here: +GitHub: https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/bin/rocprof +This command line tool is implemented as a script which is setting up the environment for attaching the profiler and then run the provided application command line. The tool uses two profiling plugins loaded by ROC runtime and based on rocProfiler and rocTracer for collecting metrics/counters, HW traces and runtime API/activity traces. The tool consumes an input XML or text file with counters list or trace parameters and provides output profiling data and statistics in various formats as text, CSV and JSON traces. Google Chrome tracing can be used to visualize the JSON traces with runtime API/activity timelines and per kernel counters data. +## 2. Profiling Modes +‘rocprof’ can be used for GPU profiling using HW counters and application tracing +### 2.1. GPU profiling +GPU profiling is controlled with input file which defines a list of metrics/counters and a profiling scope. An input file is provided using option ‘-i ’. Output CSV file with a line per submitted kernel is generated. Each line has kernel name, kernel parameters and counter values. By option ‘—stats’ the kernel execution stats can be generated in CSV format. Currently profiling has limitation of serializing submitted kernels. 
+An example of input file: +``` + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts + # Perf counters group 2 + pmc : TCC_HIT[0], TCC_MISS[0] + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 +``` +An example of profiling command line for ‘MatrixTranspose’ application +``` +$ rocprof -i input.txt MatrixTranspose +RPL: on '191018_011134' from '/…./rocprofiler_pkg' in '/…./MatrixTranspose' +RPL: profiling '"./MatrixTranspose"' +RPL: input file 'input.txt' +RPL: output dir '/tmp/rpl_data_191018_011134_9695' +RPL: result dir '/tmp/rpl_data_191018_011134_9695/input0_results_191018_011134' +ROCProfiler: rc-file '/…./rpl_rc.xml' +ROCProfiler: input from "/tmp/rpl_data_191018_011134_9695/input0.xml" + gpu_index = + kernel = + range = + 4 metrics + L2CacheHit, VFetchInsts, VWriteInsts, MemUnitStalled + 0 traces +Device name Ellesmere [Radeon RX 470/480/570/570X/580/580X] +PASSED! + +ROCPRofiler: 1 contexts collected, output directory /tmp/rpl_data_191018_011134_9695/input0_results_191018_011134 +RPL: '/…./MatrixTranspose/input.csv' is generated +``` +#### 2.1.1. Counters and metrics +There are two profiling features, metrics and traces. Hardware performance counters are treated as the basic metrics and the formulas can be defined for derived metrics. +Counters and metrics can be dynamically configured using XML configuration files with counters and metrics tables: + - Counters table entry, basic metric: counter name, block name, event id + - Derived metrics table entry: metric name, an expression for calculation the metric from the counters + +Metrics XML File Example: +``` + + + + . . . + + + + . . . + + + + + +``` +##### 2.1.1.1. Metrics query +Available counters and metrics can be queried by options ‘—list-basic’ for counters and ‘—list-derived’ for derived metrics. 
The output for counters indicates number of block instances and number of block counter registers. The output for derived metrics prints the metrics expressions. +Examples: +``` +$ rocprof --list-basic +RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +ROCProfiler: rc-file '/…./rpl_rc.xml' +Basic HW counters: + gpu-agent0 : GRBM_COUNT : Tie High - Count Number of Clocks + block GRBM has 2 counters + gpu-agent0 : GRBM_GUI_ACTIVE : The GUI is Active + block GRBM has 2 counters + . . . + gpu-agent0 : TCC_HIT[0-15] : Number of cache hits. + block TCC has 4 counters + gpu-agent0 : TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. + block TCC has 4 counters + . . . + +$ rocprof --list-derived +RPL: on '191018_015911' from '/opt/rocm/rocprofiler' in '/home/evgeny/work/BUILD/0_MatrixTranspose' +ROCProfiler: rc-file '/home/evgeny/rpl_rc.xml' +Derived metrics: + gpu-agent0 : TCC_HIT_sum : Number of cache hits. Sum over TCC instances. + TCC_HIT_sum = sum(TCC_HIT,16) + gpu-agent0 : TCC_MISS_sum : Number of cache misses. Sum over TCC instances. + TCC_MISS_sum = sum(TCC_MISS,16) + gpu-agent0 : TCC_MC_RDREQ_sum : Number of 32-byte reads. Sum over TCC instaces. + TCC_MC_RDREQ_sum = sum(TCC_MC_RDREQ,16) + . . . +``` +##### 2.1.1.2. Metrics collecting +Counters and metrics accumulated per kernel can be collected using input file with a list of metrics, see an example in 2.1. +Currently profiling has limitation of serializing submitted kernels. +The number of counters which can be dumped by one run is limited by GPU HW by number of counter registers per block. The number of counters can be different for different blocks and can be queried, see 2.1.1.1. +###### 2.1.1.2.1. Blocks instancing +GPU blocks are implemented as several identical instances. To dump counters of specific instance square brackets can be used, see an example in 2.1. +The number of block instances can be queried, see 2.1.1.1. +###### 2.1.1.2.2. 
HW limitations +The number of counters which can be dumped by one run is limited by GPU HW by number of counter registers per block. The number of counters can be different for different blocks and can be queried, see 2.1.1.1. + - Metrics groups + +To dump a list of metrics exceeding HW limitations the metrics list can be split on groups. +The tool supports automatic splitting on optimal metric groups: +``` +$ rocprof -i input.txt ./MatrixTranspose +RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +RPL: profiling './MatrixTranspose' +RPL: input file 'input.txt' +RPL: output dir '/tmp/rpl_data_191018_032645_12106' +RPL: result dir '/tmp/rpl_data_191018_032645_12106/input0_results_191018_032645' +ROCProfiler: rc-file '/…./rpl_rc.xml' +ROCProfiler: input from "/tmp/rpl_data_191018_032645_12106/input0.xml" + gpu_index = + kernel = + range = + 20 metrics + Wavefronts, VALUInsts, SALUInsts, SFetchInsts, FlatVMemInsts, LDSInsts, FlatLDSInsts, GDSInsts, VALUUtilization, FetchSize, WriteSize, L2CacheHit, VWriteInsts, GPUBusy, VALUBusy, SALUBusy, MemUnitStalled, WriteUnitStalled, LDSBankConflict, MemUnitBusy + 0 traces +Device name Ellesmere [Radeon RX 470/480/570/570X/580/580X] + +Input metrics out of HW limit. Proposed metrics group set: + group1: L2CacheHit VWriteInsts MemUnitStalled WriteUnitStalled MemUnitBusy FetchSize FlatVMemInsts LDSInsts VALUInsts SALUInsts SFetchInsts FlatLDSInsts GPUBusy Wavefronts + group2: WriteSize GDSInsts VALUUtilization VALUBusy SALUBusy LDSBankConflict + +ERROR: rocprofiler_open(), Construct(), Metrics list exceeds HW limits + +Aborted (core dumped) +Error found, profiling aborted. +``` + - Collecting with multiple runs + +To collect several metric groups a full application replay is used by defining several ‘pmc:’ lines in the input file, see 2.1. + +### 2.2. 
Application tracing +Supported application tracing includes runtime API and GPU activity tracing. +Supported runtimes are: ROCr (HSA API) and HIP +Supported GPU activity: kernel execution, async memory copy, barrier packets. +The trace is generated in JSON format compatible with Chrome tracing. +The trace consists of several sections with timelines for API trace per thread and GPU activity. The timelines events show event name and parameters. +Supported options: ‘—hsa-trace’, ‘—hip-trace’, ‘—sys-trace’, where ‘sys trace’ is for HIP and HSA combined trace. +#### 2.2.1. HIP runtime trace +The trace is generated by option ‘—hip-trace’ and includes HIP API timelines and GPU activity at the runtime level. +#### 2.2.2. ROCr runtime trace +The trace is generated by option ‘—hsa-trace’ and includes ROCr API timelines and GPU activity at AQL queue level. Also, can provide counters per kernel. +#### 2.2.3. KFD driver trace +The trace is generated by option ‘—kfd-trace’ and includes KFD Thunk API timeline. +It is planned to add memory allocations/migration tracing. +#### 2.2.4. Code annotation +Support for application code annotation. +Start/stop API is supported to programmatically control the profiling. +A ‘roctx’ library provides annotation API. Annotation is visualized in JSON trace as a separate "Markers and Ranges" timeline section. +##### 2.2.4.1. Start/stop API +``` +// Tracing start API +void roctracer_start(); + +// Tracing stop API +void roctracer_stop(); +``` +##### 2.2.4.2. rocTX basic markers API +``` +// A marker created by given ASCII message +void roctxMark(const char* message); + +// Returns the 0 based level of a nested range being started by given message associated to this range. +// A negative value is returned on the error. +int roctxRangePush(const char* message); + +// Marks the end of a nested range. +// Returns the 0 based level of the range. +// A negative value is returned on the error. +int roctxRangePop(); +``` +### 2.3. 
Multiple GPUs profiling +The profiler supports multiple GPUs profiling and provides GPU id for counters and kernels data in CSV output file. Also, GPU id is indicated for the respective GPU activity timeline in JSON trace. +## 3. Profiling control +Profiling can be controlled by specifying a profiling scope, by filtering trace events and specifying interesting time intervals. +### 3.1. Profiling scope +Counters profiling scope can be specified by GPU id list, kernel name substrings list and dispatch range. +Supported range formats examples: "3:9", "3:", "3". You can see an example of input file in 2.1. +### 3.2. Tracing control +Tracing can be filtered by events names using profiler input file and by enabling interesting time intervals by command line option. +#### 3.2.1. Filtering traced APIs +A list of traced API names can be specified in profiler input file. +An example of input file line for ROCr runtime trace (HSA API): +``` +hsa: hsa_queue_create hsa_amd_memory_pool_allocate +``` +#### 3.2.2. Tracing time period +Trace can be dumped periodically with initial delay, dumping period length and rate: +``` +--trace-period +``` +### 3.3. Concurrent kernels +Currently concurrent kernels profiling is not supported, which is a planned feature. Kernels are serialized. +### 3.4. Multi-processes profiling +Multi-processes profiling is not currently supported. +### 3.5. Errors logging +Profiler errors are logged to global logs: +``` +/tmp/aql_profile_log.txt +/tmp/rocprofiler_log.txt +/tmp/roctracer_log.txt +``` +## 4. 3rd party visualization tools +‘rocprof’ is producing JSON trace compatible with Chrome Tracing, which is an internal trace visualization tool in Google Chrome. +### 4.1. Chrome tracing +Good review can be found by the link: https://aras-p.info/blog/2017/01/23/Chrome-Tracing-as-Profiler-Frontend/ +## 5. 
Command line options +The command line options can be printed with option ‘-h’: +``` +$ rocprof -h +RPL: on '191018_023018' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package. +Full path: /opt/rocm/rocprofiler/bin/rocprof +Metrics definition: /opt/rocm/rocprofiler/lib/metrics.xml + +Usage: + rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] + +Options: + -h - this help + --verbose - verbose mode, dumping all base counters used in the input metrics + --list-basic - to print the list of basic HW counters + --list-derived - to print the list of derived metrics with formulas + --cmd-qts - quoting profiled cmd-line [on] + + -i <.txt|.xml file> - input file + Input file .txt format, automatically rerun application for every pmc line: + + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize + # Perf counters group 2 + pmc : WriteSize L2CacheHit + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 + + Input file .xml format, for single profiling run: + + # Metrics list definition, also the form ":" can be used + # All defined metrics can be found in the 'metrics.xml' + # There are basic metrics for raw HW counters and high-level metrics for derived counters + + + # Filter by dispatches range, GPU index and kernel names + + + -o - output CSV file [.csv] + -d - directory where profiler store profiling data including traces [/tmp] + The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. + -t - to change the temporary directory [/tmp] + By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory. 
+ + --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] + --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off] + --ctx-wait - to wait for outstanding contexts on profiler exit [on] + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] + --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [off] + + --stats - generating kernel execution stats, file .stats.csv + + --roctx-trace - to enable rocTX application code annotation trace, "Markers and Ranges" JSON trace section. + --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible + Generated files: ._stats.txt .json + Traced API list can be set by input .txt or .xml files. + Input .txt: + hsa: hsa_queue_create hsa_amd_memory_pool_allocate + Input .xml: + + + + + + --trace-start - to enable tracing on start [on] + --trace-period - to enable trace with initial delay, with periodic sample length and rate + Supported time formats: + +Configuration file: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: + First the configuration file is looking in the current directory, then in your home, and then in the package directory. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'. + An example of 'rpl_rc.xml': + +``` +## 6. Publicly available counters and metrics +The following counters are publicly available for commercially available VEGA10/20 GPUs. 
+ +Counters: +``` +• GRBM_COUNT : Tie High - Count Number of Clocks +• GRBM_GUI_ACTIVE : The GUI is Active +• SQ_WAVES : Count number of waves sent to SQs. (per-simd, emulated, global) +• SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) +• SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) +• SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) +• SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) +• SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) +• SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) +• SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) +• SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) +• SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) +• SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd) +• SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) +• TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. +• TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. +• TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. +• TCC_HIT[0-15] : Number of cache hits. 
+• TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. +• TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. +• TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. +• TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. +• TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) +• TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests +• TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. +``` + +The following derived metrics have been defined and the profiler metrics XML specification can be found at: https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/test/tool/metrics.xml. + +Metrics: +``` +• TA_BUSY_avr : TA block is busy. Average over TA instances. +• TA_BUSY_max : TA block is busy. Max over TA instances. +• TA_BUSY_min : TA block is busy. Min over TA instances. +• TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. +• TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. +• TCC_HIT_sum : Number of cache hits. Sum over TCC instances. +• TCC_MISS_sum : Number of cache misses. Sum over TCC instances. +• TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. +• TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. +• TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances. +• TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. 
+• TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. +• TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. +• FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• GPUBusy : The percentage of time GPU was busy. +• Wavefronts : Total wavefronts. +• VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). +• SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). +• VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. +• SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). +• VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. +• FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. +• LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. +• FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). +• GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). 
+• VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). +• VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). +• SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). +• Mem32Bwrites : +• FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). +• MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). +• MemUnitStalled : The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). +• WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). +• ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). +• LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). 
+``` diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md new file mode 100644 index 00000000..975d58ca --- /dev/null +++ b/doc/rocprofiler_spec.md @@ -0,0 +1,837 @@ +# ROC Profiler Library Specification +ROC Profiler API version 7 + +## 1. High level overview +``` +The goal of the implementation is to provide a HW specific low-level performance analysis +interface for profiling of GPU compute applications. The profiling includes HW performance +counters with complex performance metrics and HW traces. The implementation distinguishes +two profiling features, metrics and traces. HW performance counters are treated as the basic +metrics and the formulas can be defined for derived complex metrics. +The library can be loaded by HSA runtime as a tool plugin and it can be loaded by higher +level HW independent performance analysis API like PAPI. +The library has C API and is based on AQLprofile AMD specific HSA extension. + + 1. The library provides methods to query the list of supported HW features. + 2. The library provides profiling APIs to start, stop, read metrics results and tracing + data. + 3. The library provides a intercepting API for collecting per-kernel profiling data for + the kernels + dispatched to HSA AQL queues. + 4. The library provides mechanism to load profiling tool library plugin by env variable + ROCP_TOOL_LIB. + 5. The library is responsible for allocation of the buffers for profiling and notifying + about output data buffer overflow for traces. + 6. The library is implemented based on AMD specific AQLprofile HSA extension. + 7. The library implementation is abstracted from the specific GFXIP. + 8. 
The library implementation is extensible: + - Easy adding of counters and metrics + - Counters enumeration + - Counters and metrics can be dynamically configured using XML configuration files with + counters and metrics tables: + o Counters table entry, basic metric: counter name, block name, event id + o Complex metrics table entry: metric name, an expression for calculation the metric + from the counters + +Metrics XML file example: + + + + . . . + + + + . . . + + + + + +``` +## 2. Environment +``` +* HSA_TOOLS_LIB - required to be set to the name of rocprofiler library to be loaded by +HSA runtime +* ROCP_METRICS - path to the metrics XML file +* ROCP_TOOL_LIB - path to profiling tool library loaded by ROC Profiler +* ROCP_HSA_INTERCEPT - if set then HSA dispatches intercepting is enabled +``` +## 3. General API +### 3.1. Description +``` +The library supports method for getting the error number and error string of the last +failed library API call. +To check the conformance of used library APi header and the library binary the version +macros and API methods can be used. + +Returning the error and error string methods: +- rocprofiler_error_string - method for returning the error string + +Library version: +- ROCPROFILER_VERSION_MAJOR - API major version macro +- ROCPROFILER_VERSION_MINOR - API minor version macro +- rocprofiler_version_major - library major version +- rocprofiler_version_minor - library minor version +``` +### 3.2. Returning the error and error string methods +``` +const char* rocprofiler_error_string(); +``` +### 3.3. Library version +``` +The library provides back compatibility if the library major version is less or equal +then the API major version macro. + +API version macros defined in the library API header 'rocprofiler.h': + +ROCPROFILER_VERSION_MAJOR +ROCPROFILER_VERSION_MINOR + +Methods to check library major and minor venison: + +uint32_t rocprofiler_major_version(); +uint32_t rocprofiler_minor_version(); +``` +## 4. 
Backend API +### 4.1. Description +``` +The library provides the methods to open/close profiling context, to start, stop and read +HW performance counters and traces, to intercept kernel dispatches to collect per-kernel +profiling data. Also the library provides methods to calculate complex performance metrics +and to query the list of available metrics. The library distinguishes two profiling features, +metrics and traces, where HW performance counters are treated as the basic metrics. To check +if there was an error the library methods return HSA standard status code. +For a given context the profiling can be started/stopped and counters sampled in standalone +mode or profiling can be initiated by intercepting the kernel dispatches with registering +a dispatch callback. +For counters sampling, which is the usage model of higher level APIs like PAPI, +the start/stop/read APIs should be used. +For collecting per-kernel data for the submitted to HSA queues kernels the dispatch callback +API should be used. +The library provides back compatibility if the library major version is less or equal. 
+ +Returned API status: +- hsa_status_t - HSA status codes are used from hsa.h header + +Loading and Configuring, loadable plugin on-load/unload methods: +- rocprofiler_settings_t – global properties +- OnLoadTool +- OnLoadToolProp +- OnUnloadTool + +Info API: +- rocprofiler_info_kind_t - profiling info kind +- rocprofiler_info_query_t - profiling info query +- rocprofiler_info_data_t - profiling info data +- rocprofiler_get_info - return the info for a given info kind +- rocprofiler_iterate_info - iterate over the info for a given info kind +- rocprofiler_query_info - iterate over the info for a given info query + +Context API: +- rocprofiler_t - profiling context handle +- rocprofiler_feature_kind_t - profiling feature kind +- rocprofiler_feature_parameter_t - profiling feature parameter +- rocprofiler_data_kind_t - profiling data kind +- rocprofiler_data_t - profiling data +- rocprofiler_feature_t - profiling feature +- rocprofiler_mode_t - profiling modes +- rocprofiler_properties_t - profiler properties +- rocprofiler_open - open new profiling context +- rocprofiler_close - close profiling context and release all allocated resources +- rocprofiler_group_count - return profiling groups count +- rocprofiler_get_group - return profiling group for a given index +- rocprofiler_get_metrics - method for calculating the metrics data +- rocprofiler_iterate_trace_data - method for iterating output trace data instances +- rocprofiler_time_id_t - supported time value ID enumeration +- rocprofiler_get_time – return time for a given time ID and profiling timestamp value + +Sampling API: +- rocprofiler_start - start profiling +- rocprofiler_stop - stop profiling +- rocprofiler_read - read profiling data to the profiling features objects +- rocprofiler_get_data - wait for profiling data + Group versions of start/stop/read/get_data methods: + o rocprofiler_group_start + o rocprofiler_group_stop + o rocprofiler_group_read + o rocprofiler_group_get_data + +Intercepting API: +- 
rocprofiler_callback_t - profiling callback type +- rocprofiler_callback_data_t - profiling callback data type +- rocprofiler_dispatch_record_t – dispatch record +- rocprofiler_queue_callbacks_t – queue callbacks, dispatch/destroy +- rocprofiler_set_queue_callbacks - set queue kernel dispatch and queue destroy callbacks +- rocprofiler_remove_queue_callbacks - remove queue callbacks + +Context pool API: +- rocprofiler_pool_t – context pool handle +- rocprofiler_pool_entry_t – context pool entry +- rocprofiler_pool_properties_t – context pool properties +- rocprofiler_pool_handler_t – context pool completion handler +- rocprofiler_pool_open - context pool open +- rocprofiler_pool_close - context pool close +- rocprofiler_pool_fetch – fetch and empty context entry to pool +- rocprofiler_pool_release – release a context entry +- rocprofiler_pool_iterate – iterated fetched context entries +- rocprofiler_pool_flush – flush completed context entries +``` +### 4.2. Loading and Configuring +``` +Loading and Configuring +The profiling properties can be set by profiler plugin on loading by ROC runtime. +The profiler library plugin can be set by ROCP_TOOL_LIB env var. + +Global properties: + +typedef struct { + uint32_t intercept_mode; + uint64_t timeout; + uint32_t timestamp_on; +} rocprofiler_settings_t; + +On load/unload methods defined in profiling tool library loaded by ROCP_TOOL_LIB env var: +extern "C" void OnLoadTool(); +extern "C" void OnLoadToolProp(rocprofiler_settings_t* settings); +extern "C" void OnUnloadTool(); + +``` +### 4.3. Info API +``` +The profiling metrics are defined by name and the traces are defined by name and parameters. +All supported features can be iterated using 'iterate_info/query_info' methods. The counter +names are defined in counters table configuration file, each counter has a unique name and +defined by block name and event id. 
The traces and trace parameters names are the same as in +the hardware documentation and the parameters codes are rocprofiler_feature_parameter_t values, +see below in the "Context API" section. + +Profiling info kind: + +typedef enum { + ROCPROFILER_INFO_KIND_METRIC = 0, // metric info + ROCPROFILER_INFO_KIND_METRIC_COUNT = 1, // metrics count + ROCPROFILER_INFO_KIND_TRACE = 2, // trace info + ROCPROFILER_INFO_KIND_TRACE_COUNT = 3, // traces count +} rocprofiler_info_kind_t; + +Profiling info data: + +typedef struct { + rocprofiler_info_kind_t kind; // info data kind + union { + struct { + const char* name; // metric name + uint32_t instances; // instances number + const char* expr; // metric expression, NULL for basic counters + const char* description; // metric description + const char* block_name; // block name + uint32_t block_counters; // number of block counters + } metric; + struct { + const char* name; // trace name + const char* description; // trace description + uint32_t parameter_count; // supported by the trace number + // parameters + } trace; + }; +} rocprofiler_info_data_t; + +Return info for a given info kind: + +hsa_status_t rocprofiler_get_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + void *data); // data passed to callback + +Iterate over the info for a given info kind, and invoke an application-defined callback on +every iteration: + +hsa_status_t rocprofiler_iterate_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); + +Iterate over the info for a given info query, and invoke an application-defined callback on +every iteration. 
The query +fields set to NULL define the query wildcard: + +hsa_status_t rocprofiler_query_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + rocprofiler_info_data_t query, // info query + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); // data passed to callback +``` +### 4.4. Context API +``` +Profiling context is accumulating all profiling information including profiling features +which carry profiling data, required buffers for profiling command packets and output data. +The context can be created and deleted by the library open/close methods. By deleting +the context all accumulated by the library resources associated with this context will be +released. If more than one run is required to collect all requested counters data then +data for all profiling groups should be collected and then the metrics can be calculated by +loading the saved groups' data to the profiling context. Saving and loading of the groups +data is the responsibility of the tool. The groups are automatically identified on the profiling +context open and there is API to access them, see the "Profiling groups" section below. 
+ +Profiling context handle: + +typename rocprofiler_t; + +Profiling feature kind: + +typedef enum { + ROCPROFILER_FEATURE_KIND_METRIC = 0, // metric + ROCPROFILER_FEATURE_KIND_TRACE = 1 // trace +} rocprofiler_feature_kind_t; + +Profiling feature parameter: + +typedef hsa_ven_amd_aqlprofile_parameter_t rocprofiler_feature_parameter_t; + +Profiling data kind: + +typedef enum { + ROCPROFILER_DATA_KIND_UNINIT = 0, // data uninitialized + ROCPROFILER_DATA_KIND_INT32 = 1, // 32bit integer + ROCPROFILER_DATA_KIND_INT64 = 2, // 64bit integer + ROCPROFILER_DATA_KIND_FLOAT = 3, // float single-precision result + ROCPROFILER_DATA_KIND_DOUBLE = 4, // float double-precision result + ROCPROFILER_DATA_KIND_BYTES = 5 // trace output as a bytes array +} rocprofiler_data_kind_t; + + +Profiling data: + +typedef struct { + rocprofiler_data_kind_t kind; // result kind + union { + uint32_t result_int32; // 32bit integer result + uint64_t result_int64; // 64bit integer result + float result_float; // float single-precision result + double result_double; // float double-precision result + typedef struct { + void* ptr; // pointer + uint32_t size; // byte size + uint32_t instances; // number of trace instances + } result_bytes; // data by ptr and byte size + }; +} rocprofiler_data_t; + +Profiling feature: + +typedef struct { + rocprofiler_feature_kind_t type; // feature type + const char* name; // feature name + const rocprofiler_feature_parameter_t* parameters; // feature parameters + uint32_t parameter_count; // feature parameter count + rocprofiler_data_t* data; // profiling data +} rocprofiler_feature_t; + +Profiling mode masks: +There are several modes which can be specified for the profiling context. +STANDALONE mode can be used for the counters sampling in another then application context +to support statistical system wide profiling. In this mode the profiling context supports +its own queue which can be created on the context open if the CREATEQUEUE mode also specified. 
+See also "Profiler properties" section below for the standalone mode queue properties. +The profiler supports several profiling groups for collecting profiling data in several +runs and 'SINGLEGROUP' mode allows only one group and the context open will fail if more +groups are needed. + +typedef enum { + ROCPROFILER_MODE_STANDALONE = 1, // standalone mode when ROC profiler + // supports own AQL queue + ROCPROFILER_MODE_CREATEQUEUE = 2, // profiler creates queue in STANDALONE mode + ROCPROFILER_MODE_SINGLEGROUP = 4 // profiler allows one group only and fails + // if more groups are needed +} rocprofiler_mode_t; + +Context data readiness callback: + +typedef void (*rocprofiler_context_callback_t)( + rocprofiler_group_t* group, // profiling group + void* arg); // callback arg + +Profiler properties: +There are several properties which can be specified for the context. A callback can be +registered which will be called when the context data is ready. In standalone profiling mode +'ROCPROFILER_MODE_STANDALONE' the context supports its own queue and the queue can be set by +the property 'queue' or a queue will be created with the specified depth 'queue_depth' if mode +'ROCPROFILER_MODE_CREATEQUEUE' also specified. 
+ +typedef struct { + rocprofiler_context_callback_t callback; // callback on the context data readiness + void* callback_arg; // callback arg + hsa_queue_t* queue; // HSA queue for standalone mode + uint32_t queue_depth; // created queue depth, for create-queue mode +} rocprofiler_properties_t; + +Open/close profiling context: + +hsa_status_t rocprofiler_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in/out] profiling feature array + uint32_t feature_count, // profiling feature count + rocprofiler_t** context, // [out] profiling context handle + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties); // profiler properties + +hsa_status_t rocprofiler_close( + rocprofiler_t* context); // [in] profiling context + +Profiling groups: +The profiler on the context open automatically identifies a required number of the application +runs to collect all data needed for all specified metrics and creates a metric group per each +run. Data for all profiling groups should be collected and then the metrics can be calculated +by loading the saved groups' data to the profiling context. Saving and loading of the groups +data is the responsibility of the tool. + +typedef struct { + uint32_t index; // profiling group index + rocprofiler_feature_t** features; // profiling features array + uint32_t feature_count; // profiling feature count + rocprofiler_t* context; // profiling context handle +} rocprofiler_group_t; + +Return profiling groups count: + +hsa_status_t rocprofiler_group_count( + rocprofiler_t* context, // [in/out] profiling context + uint32_t* count); // [out] profiling groups count + +Return the profiling group for a given index: + +hsa_status_t rocprofiler_get_group( + rocprofiler_t* context, // [in/out] profiling context, + // will be returned as + // a part of the group structure + uint32_t index, // [in] group index + rocprofiler_group_t* group); // [out] profiling group + +Calculate metrics data. 
The data will be stored to the registered profiling features data fields: +After all profiling context data is ready the registered metrics can be calculated. The context +data readiness can be checked by 'get_data' API or using the context callback. + +hsa_status_t rocprofiler_get_metrics( + rocprofiler_t* context); // [in/out] profiling context + +Method for iterating trace data instances: +Trace data can have several instances, for example, one instance per Shader Engine. + +hsa_status_t rocprofiler_iterate_trace_data( + const rocprofiler_t* context, // [in] context object + hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate + // the output data + void* callback_data); // [in/out] passed to callback data + +Converting a profiling timestamp to a time value for a supported time ID. +Supported time value ID enumeration: +typedef enum { + ROCPROFILER_TIME_ID_CLOCK_REALTIME = 0, // Linux realtime clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 1, // Linux monotonic clock time +} rocprofiler_time_id_t; + +Method for converting a profiling timestamp to a time value for a given time ID: +hsa_status_t rocprofiler_get_time( + rocprofiler_time_id_t time_id, // identifier of the particular + // time to convert the timestamp + uint64_t timestamp, // profiling timestamp + uint64_t* value_ns); // [out] returned time ‘ns’ value +``` +### 4.5. Sampling API +``` +The API supports the counters sampling usage model with start/read/stop methods and also allows +waiting for the profiling data in the intercepting usage model with the get_data method. 
+ +Start/stop/read methods: + +hsa_status_t rocprofiler_start( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +hsa_status_t rocprofiler_stop( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +hsa_status_t rocprofiler_read( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +Wait for profiling data: + +hsa_status_t rocprofiler_get_data( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +Group versions of the above start/stop/read/get_data methods: + +hsa_status_t rocprofiler_group_start( + rocprofiler_group_t* group); // [in/out] profiling group + +hsa_status_t rocprofiler_group_stop( + rocprofiler_group_t* group); // [in/out] profiling group + + +hsa_status_t rocprofiler_group_read( + rocprofiler_group_t* group); // [in/out] profiling group + + +hsa_status_t rocprofiler_group_get_data( + rocprofiler_group_t* group); // [in/out] profiling group +``` +### 4.6. Intercepting API +``` +The library provides a callback API for enabling profiling for the kernels dispatched to +HSA AQL queues. The API enables per-kernel profiling data collection. +Currently implemented the option with serializing the kernels execution. 
+ +ROC profiler callback type: + +hsa_status_t (*rocprofiler_callback_t)( + const rocprofiler_callback_data_t* callback_data, // callback data passed by HSA runtime + void* user_data, // [in/out] user data passed + // to the callback + rocprofiler_group** group); // [out] returned profiling group + +Profiling callback data: + +typedef struct { + uint64_t dispatch; // dispatch timestamp + uint64_t begin; // begin timestamp + uint64_t end; // end timestamp + uint64_t complete; // completion signal timestamp +} rocprofiler_dispatch_record_t; + +typedef struct { + hsa_agent_t agent; // GPU agent handle + uint32_t agent_index; // GPU index + const hsa_queue_t* queue; // HSA queue + uint64_t queue_index; // Index in the queue + const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet + const char* kernel_name; // Kernel name + const rocprofiler_dispatch_record_t* record; // Dispatch record +} rocprofiler_callback_data_t; + +Queue callbacks: + +typedef struct { + rocprofiler_callback_t dispatch; // kernel dispatch callback + hsa_status_t (*destroy)(hsa_queue_t* queue, void* data); // queue destroy callback +} rocprofiler_queue_callbacks_t; + +Adding/removing kernel dispatch and queue destroy callbacks + +hsa_status_t rocprofiler_set_intercepting( + rocprofiler_intercepting_t callbacks, // intercepting callbacks + void* data); // [in/out] passed callbacks data + +hsa_status_t rocprofiler_remove_intercepting(); +``` +### 4.7. Profiling Context Pools +``` +The API provide capability to create a context pool for a given agent and a set of features, to fetch/release a context entry, to register a callback for pool’s contexts completion. 
+Profiling pool handle: +typename rocprofiler_pool_t; +Profiling pool entry: +typedef struct { + rocprofiler_t* context; // context object + void* payload; // payload data object +} rocprofiler_pool_entry_t; + +Profiling handler, calling on profiling completion: +typedef bool (*rocprofiler_pool_handler_t)(const rocprofiler_pool_entry_t* entry, void* arg); + +Profiling properties: +typedef struct { + uint32_t num_entries; // pool size entries + uint32_t payload_bytes; // payload size bytes + rocprofiler_pool_handler_t handler; // handler on context completion + void* handler_arg; // the handler arg +} rocprofiler_pool_properties_t; + +Open profiling pool: +hsa_status_t rocprofiler_pool_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling info count + rocprofiler_pool_t** pool, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_pool_properties_t*); // pool properties + +Close profiling pool: +hsa_status_t rocprofiler_pool_close( + rocprofiler_pool_t* pool); // profiling pool handle + +Fetch profiling pool entry: +hsa_status_t rocprofiler_pool_fetch( + rocprofiler_pool_t* pool, // profiling pool handle + rocprofiler_pool_entry_t* entry); // [out] empty profiling pool entry + +Release profiling pool entry: +hsa_status_t rocprofiler_pool_release( + rocprofiler_pool_entry_t* entry); // released profiling pool entry + +Iterate fetched profiling pool entries: +hsa_status_t rocprofiler_pool_iterate( + rocprofiler_pool_t* pool, // profiling pool handle + hsa_status_t (*callback)(rocprofiler_pool_entry_t* entry, void* data), + // callback + void *data); // [in/out] data passed to callback + +Flush completed entries in profiling pool: +hsa_status_t rocprofiler_pool_flush( + rocprofiler_pool_t* pool); // profiling pool handle +``` +## 5. Application code examples +### 5.1. 
Querying available metrics +``` +Info data callback: + + hsa_status_t info_data_callback(const rocprofiler_info_data_t info, void *data) { + switch (info.kind) { + case ROCPROFILER_INFO_KIND_METRIC: { + if (info.metric.expr != NULL) { + fprintf(stdout, "Derived counter: gpu-agent%d : %s : %s\n", + info.agent_index, info.metric.name, info.metric.description); + fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); + } else { + fprintf(stdout, "Basic counter: gpu-agent%d : %s", + info.agent_index, info.metric.name); + if (info.metric.instances > 1) { + fprintf(stdout, "[0-%u]", info.metric.instances - 1); + } + fprintf(stdout, " : %s\n", info.metric.description); + fprintf(stdout, " block %s has %u counters\n", + info.metric.block_name, info.metric.block_counters); + } + fflush(stdout); + break; + } + default: + printf("wrong info kind %u\n", kind); + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; + } + +Printing all available metrics: + + hsa_status_t status = rocprofiler_iterate_info( + agent, + ROCPROFILER_INFO_KIND_METRIC, + info_data_callback, + NULL); + +``` +### 5.2. Profiling code example +``` +Profiling of L1 miss ratio, average memory bandwidth. +In the example below rocprofiler_group_get_data group APIs are used for the purpose of a usage +example but in SINGLEGROUP mode when only one group is allowed the context handle itself can be +saved and then direct context method rocprofiler_get_data with default group index equal to 0 +can be used. 
+ +hsa_status_t dispatch_callback( + const rocprofiler_callback_data_t* callback_data, + void* user_data, + rocprofiler_group_t* group) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + // Profiling context + rocprofiler_t* context; + // Profiling info objects + rocprofiler_feature_t features* = new rocprofiler_feature_t[2]; + // Tracing parameters + rocprofiler_feature_parameter_t* parameters = new rocprofiler_feature_parameter_t[2]; + + // Setting profiling features + features[0].type = ROCPROFILER_METRIC; + features[0].name = "L1_MISS_RATIO"; + features[1].type = ROCPROFILER_METRIC; + features[1].name = "DRAM_BANDWIDTH"; + + // Creating profiling context + status = rocprofiler_open(callback_data->dispatch.agent, features, 2, &context, + ROCPROFILER_MODE_SINGLEGROUP, NULL); + + + // Get the profiling group + // For general case with many groups there is rocprofiler_group_count() API + const uint32_t group_index = 0 + status = rocprofiler_get_group(context, group_index, group); + + + // In SINGLEGROUP mode the context handle itself can be saved, because there is just one group + + + return status; +} + +Profiling tool constructor is adding the dispatch callback: + +void profiling_libary_constructor() { + // Defining callback data, no data in this simple example + void* callback_data = NULL; + + // Adding observers + hsa_sttaus_t status = rocprofiler_add_dispatch_callback(dispatch_callback, callback_data); + + + // Dispatching profiled kernel + +} + +void profiling_libary_destructor() { + > { + // In SINGLEGROUP mode the rocprofiler_get_group() method with default zero group + // index can be used, if context handle would be saved + status = rocprofiler_group_get_data(entry->group); + + status = rocprofiler_get_metrics(entry->group->context); + + status = rocprofiler_close(entry->group->context); + + + dispatch_data, entry->features, entry->features_count)>; + } +} +``` +### 5.3. 
Option to use completion callback +``` +Creating profiling context with completion callback: + . . . + rocprofiler_properties_t properties = {}; + properties.callback = completion_callback; + properties.callback_arg = NULL; // no args defined + status = rocprofiler_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . + +Definition of completion callback: + +void completion_callback(profiler_group_t group, void* arg) { + + hsa_status_t status = rocprofiler_close(group.context); + +} +``` +### 5.4. Option to Use Context Pool +``` +Code example of context pool usage. +Creating profiling contexts pool: + . . . + rocprofiler_pool_properties_t properties{}; + properties.num_entries = 100; + properties.payload_bytes = sizeof(context_entry_t); + properties.handler = context_handler; + properties.handler_arg = handler_arg; + status = rocprofiler_pool_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . + +Fetching a context entry: + rocprofiler_pool_entry_t pool_entry{}; + status = rocprofiler_pool_fetch(pool, &pool_entry); + + // Profiling context entry + rocprofiler_t* context = pool_entry.context; + context_entry_t* entry = reinterpret_cast + (pool_entry.payload); +``` +### 5.5. Standalone Sampling Usage Code Example +``` +The profiling metrics are being read from separate standalone queue other than the application kernels are submitted to. +To enable the sampling mode, the profiling mode in all user queues should be enabled. It can be done by loading ROC-profiler +library to HSA runtime using the environment variable HSA_TOOLS_LIB for all shell sessions. 
+ // Sampling rate + uint32_t sampling_rate = ; + // Sampling count + uint32_t sampling_count = ; + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + // HSA agent + hsa_agent_t agent; + // Profiling context + rocprofiler_t* context = NULL; + // Profiling properties + rocprofiler_properties_t properties; + + // Getting HSA agent + + + // Profiling feature objects + const unsigned feature_count = 2; + rocprofiler_feature_t feature[feature_count]; + + // Counters and metrics + feature[0].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[0].name = "GPUBusy"; + feature[1].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[1].name = "SQ_WAVES"; + + // Creating profiling context with standalone queue + properties = {}; + properties.queue_depth = 128; + status = rocprofiler_open(agent, feature, feature_count, &context, + ROCPROFILER_MODE_STANDALONE| ROCPROFILER_MODE_CREATEQUEUE| + ROCPROFILER_MODE_SINGLEGROUP, &properties); + + + // Start counters and sample them in the loop with the sampling rate + status = rocprofiler_start(context, 0); + + + for (unsigned ind = 0; ind < sampling_count; ++ind) { + sleep(sampling_rate); + status = rocprofiler_read(context, 0); + + status = rocprofiler_get_data(context, 0); + + status = rocprofiler_get_metrics(context); + + print_results(feature, feature_count); + } + + // Stop counters + status = rocprofiler_stop(context, group_n); + + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + status = rocprofiler_close(context); + +``` +### 5.6. 
Printing Out Profiling Results +``` +Below is a code example for printing out the profiling results from profiling features array: +void print_results(rocprofiler_feature_t* feature, uint32_t feature_count) { + for (rocprofiler_feature_t* p = feature; p < feature + feature_count; ++p) + { + std::cout << (p - feature) << ": " << p->name; + switch (p->data.kind) { + case ROCPROFILER_DATA_KIND_INT64: + std::cout << " result_int64 (" << p->data.result_int64 << ")" + << std::endl; + break; + + case ROCPROFILER_DATA_KIND_BYTES: { + std::cout << " result_bytes ptr(" << p->data.result_bytes.ptr << + ") " << " size(" << p->data.result_bytes.size << ")" + << " instance_count(" << p->data.result_bytes.instance_count + << ")"; + break; + } + default: + std::cout << "bad result kind (" << p->data.kind << ")" + << std::endl; + + } + } +} +``` diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index c2a79af2..8da94414 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -103,6 +103,26 @@ + + + + + + + + + + + + + + + + + + + +