Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 34 additions & 13 deletions metrix/src/metrix/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,9 @@ class DeviceSpecs:

# Memory specs
hbm_bandwidth_gbs: float = 0.0
l2_bandwidth_gbs: float = 0.0
l2_size_mb: float = 0.0
lds_size_per_cu_kb: float = 0.0

# Compute capabilities
fp32_tflops: float = 0.0
fp64_tflops: float = 0.0
int8_tops: float = 0.0

# Clock speeds
boost_clock_mhz: int = 0

Expand Down Expand Up @@ -144,6 +138,7 @@ def _load_yaml_metrics_if_available(self):

# Parse YAML and collect metrics matching this architecture first
yaml_metrics = {}
yaml_unsupported = {}
counters_section = yaml_data["rocprofiler-sdk"].get("counters", [])

for counter_def in counters_section:
Expand All @@ -164,13 +159,16 @@ def _load_yaml_metrics_if_available(self):
if definition is None:
continue

# Check if this metric is marked unsupported for this architecture
unsupported_reason = definition.get("unsupported_reason")
if unsupported_reason:
yaml_unsupported[counter_name] = unsupported_reason
continue

# Register counters: derived, reduce(), and built-in
if "expression" in definition:
expression = definition["expression"]

# Check if this is a simple reduce() expression
import re

reduce_match = re.match(
r"^reduce\([A-Z_0-9]+,\s*(?:sum|max|min)\)$", expression.strip()
)
Expand All @@ -193,15 +191,31 @@ def _load_yaml_metrics_if_available(self):
"compute": lambda cn=counter_name: self._raw_data.get(cn, 0.0),
}

if not yaml_metrics:
if not yaml_metrics and not yaml_unsupported:
return

# YAML metrics found for this arch -- replace @metric-based metrics
self._metrics.clear()
self._unsupported_metrics.clear()
self._metrics.update(yaml_metrics)
self._unsupported_metrics.update(yaml_unsupported)
print(f"✓ Loaded {len(self._metrics)} YAML-based metrics for {arch}")

@property
def _builtin_expression_vars(self) -> set:
    """Names the YAML expression namespace supplies besides hardware counters.

    The set is rebuilt on every access from the dataclass fields of
    ``self.device_specs`` (upper-cased, with ``ARCH`` and ``NAME`` left
    out) plus ``DURATION_US``, so a newly added spec field is picked up
    without editing this property.

    Returns:
        A fresh set of upper-case variable names.
    """
    import dataclasses

    spec_vars = {spec.name.upper() for spec in dataclasses.fields(self.device_specs)}
    # ARCH and NAME are dropped from the result; DURATION_US is added
    # explicitly on top of the spec-derived names.
    return (spec_vars - {"ARCH", "NAME"}) | {"DURATION_US"}

def _extract_counters_from_expression(self, expression: str) -> List[str]:
"""Extract counter names from YAML expression"""
import re
Expand All @@ -212,10 +226,10 @@ def _extract_counters_from_expression(self, expression: str) -> List[str]:
for match in re.finditer(r"reduce\(([A-Z_0-9]+),\s*(?:sum|max|min)\)", expression):
counters.add(match.group(1))

# Extract standalone counter names
# Extract standalone counter names (uppercase identifiers)
for match in re.finditer(r"\b([A-Z][A-Z_0-9]*(?:_sum)?)\b", expression):
counter_name = match.group(1)
if counter_name not in ["CU_NUM"]:
if counter_name not in self._builtin_expression_vars:
counters.add(counter_name)

return sorted(list(counters))
Expand All @@ -225,8 +239,15 @@ def _create_yaml_compute_function(self, expression: str, metric_name: str):
import re

def compute():
import dataclasses

namespace = dict(self._raw_data)
namespace["CU_NUM"] = self.device_specs.num_cu

# Inject all DeviceSpecs fields as UPPER_CASE variables
for f in dataclasses.fields(self.device_specs):
if f.name not in ("arch", "name"):
namespace[f.name.upper()] = getattr(self.device_specs, f.name)
namespace["DURATION_US"] = getattr(self, "_current_duration_us", 0.0)

# Replace reduce(X, op) with X_op
processed_expr = re.sub(
Expand Down
272 changes: 268 additions & 4 deletions metrix/src/metrix/backends/counter_defs.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,278 @@
# Public counter definitions for IntelliKit metrix
# These use counters available via rocprofv3 --list-avail (no proprietary data)
# Counter definitions for IntelliKit metrix
# Single source of truth for all architecture metric definitions.
# Uses counters available via rocprofv3 --list-avail (no proprietary data).
#
# Built-in variables available in expressions:
# - from DeviceSpecs fields (field name upper-cased):
# NUM_CU, MAX_WAVES_PER_CU, WAVEFRONT_SIZE, BASE_CLOCK_MHZ,
# HBM_BANDWIDTH_GBS, L2_SIZE_MB, LDS_SIZE_PER_CU_KB, BOOST_CLOCK_MHZ
# - injected separately by the backend (not a DeviceSpecs field):
# DURATION_US

rocprofiler-sdk:
counters:
- name: GPU_UTILIZATION

# --- GPU utilization ---

- name: compute.gpu_utilization
definitions:
- expression: (GRBM_GUI_ACTIVE / GRBM_COUNT) * 100
architectures: [gfx1201, gfx1151]

- name: L2_HIT_RATE
# --- Cache hit rates ---

- name: memory.l2_hit_rate
definitions:
- expression: (TCC_HIT_sum / (TCC_HIT_sum + TCC_MISS_sum)) * 100
architectures: [gfx942, gfx90a]
- expression: (GL2C_HIT_sum / (GL2C_HIT_sum + GL2C_MISS_sum)) * 100
architectures: [gfx1201, gfx1151]

- name: memory.l1_hit_rate
definitions:
- expression: >-
((TCP_TOTAL_CACHE_ACCESSES_sum - TCP_TCC_READ_REQ_sum)
/ TCP_TOTAL_CACHE_ACCESSES_sum) * 100
architectures: [gfx942, gfx90a]

# --- L2 bandwidth ---

- name: memory.l2_bandwidth
definitions:
- expression: >-
(((TCC_HIT_sum + TCC_MISS_sum) * 128) / 1e9)
/ (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
architectures: [gfx942, gfx90a]

# --- Bytes transferred ---

- name: memory.bytes_transferred_l2
definitions:
- expression: TCC_REQ_sum * 128
architectures: [gfx942, gfx90a]

- name: memory.bytes_transferred_l1
definitions:
- expression: TCP_TOTAL_CACHE_ACCESSES_sum * 128
architectures: [gfx942]
- expression: TCP_TOTAL_CACHE_ACCESSES_sum * 64
architectures: [gfx90a]

# --- Coalescing and efficiency ---

- name: memory.coalescing_efficiency
definitions:
- expression: >-
min(((SQ_INSTS_VMEM_RD + SQ_INSTS_VMEM_WR) * 16
/ TCP_TOTAL_ACCESSES_sum) * 100, 100.0)
architectures: [gfx942, gfx90a]

- name: memory.global_load_efficiency
definitions:
- expression: >-
min((SQ_INSTS_VMEM_RD / TCP_TCC_READ_REQ_sum) * 100, 100.0)
architectures: [gfx942, gfx90a]

- name: memory.global_store_efficiency
definitions:
- expression: >-
min((SQ_INSTS_VMEM_WR / TCP_TCC_WRITE_REQ_sum) * 100, 100.0)
architectures: [gfx942, gfx90a]

# --- LDS ---

- name: memory.lds_bank_conflicts
definitions:
- expression: SQ_LDS_BANK_CONFLICT / SQ_INSTS_LDS
architectures: [gfx942, gfx90a]

# --- HBM read bandwidth ---
# gfx942 has TCC_BUBBLE_sum for 128B reads; gfx90a does not

- name: memory.hbm_read_bandwidth
definitions:
- expression: >-
((TCC_BUBBLE_sum * 128
+ (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
+ TCC_EA0_RDREQ_32B_sum * 32) / 1e9)
/ (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
architectures: [gfx942]
- expression: >-
(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64
+ TCC_EA_RDREQ_32B_sum * 32) / 1e9)
/ (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
architectures: [gfx90a]

# --- HBM write bandwidth ---

- name: memory.hbm_write_bandwidth
definitions:
- expression: >-
((TCC_EA0_WRREQ_64B_sum * 64
+ (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) / 1e9)
/ (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
architectures: [gfx942]
- expression: >-
((TCC_EA_WRREQ_64B_sum * 64
+ (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32) / 1e9)
/ (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
architectures: [gfx90a]

# --- HBM bandwidth utilization ---

- name: memory.hbm_bandwidth_utilization
definitions:
- expression: >-
((TCC_BUBBLE_sum * 128
+ (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
+ TCC_EA0_RDREQ_32B_sum * 32
+ TCC_EA0_WRREQ_64B_sum * 64
+ (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) / 1e9)
/ (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
/ HBM_BANDWIDTH_GBS * 100
architectures: [gfx942]
- expression: >-
(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64
+ TCC_EA_RDREQ_32B_sum * 32
+ TCC_EA_WRREQ_64B_sum * 64
+ (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32) / 1e9)
/ (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
/ HBM_BANDWIDTH_GBS * 100
architectures: [gfx90a]

# --- Total HBM bytes ---

- name: memory.bytes_transferred_hbm
definitions:
- expression: >-
TCC_BUBBLE_sum * 128
+ (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
+ TCC_EA0_RDREQ_32B_sum * 32
+ TCC_EA0_WRREQ_64B_sum * 64
+ (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32
architectures: [gfx942]
- expression: >-
(TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64
+ TCC_EA_RDREQ_32B_sum * 32
+ TCC_EA_WRREQ_64B_sum * 64
+ (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32
architectures: [gfx90a]

# --- Atomic latency ---

- name: memory.atomic_latency
definitions:
- expression: TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum
architectures: [gfx942]
- unsupported_reason: >-
TCC_EA_ATOMIC_LEVEL_sum counter is broken on MI200 (gfx90a).
This metric only works correctly on MI300X (gfx942) and newer GPUs.
architectures: [gfx90a]

# --- Compute: total FLOPS ---

- name: compute.total_flops
definitions:
- expression: >-
64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+ SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+ (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+ SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+ SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+ 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+ SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64)
architectures: [gfx942, gfx90a]

# --- Compute: GFLOPS ---

- name: compute.hbm_gflops
definitions:
- expression: >-
((64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+ SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+ (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+ SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+ SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+ 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+ SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
/ 1e9) / (DURATION_US / 1e6)
if DURATION_US > 0 else 0.0
architectures: [gfx942, gfx90a]

# --- Compute: HBM arithmetic intensity ---

- name: compute.hbm_arithmetic_intensity
definitions:
- expression: >-
(64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+ SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+ (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+ SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+ SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+ 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+ SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
/ (TCC_BUBBLE_sum * 128
+ (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
+ TCC_EA0_RDREQ_32B_sum * 32
+ TCC_EA0_WRREQ_64B_sum * 64
+ (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)
architectures: [gfx942]
- expression: >-
(64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+ SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+ (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+ SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+ SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+ 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+ SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
/ ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64
+ TCC_EA_RDREQ_32B_sum * 32
+ TCC_EA_WRREQ_64B_sum * 64
+ (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)
architectures: [gfx90a]

# --- Compute: L2 arithmetic intensity ---

- name: compute.l2_arithmetic_intensity
definitions:
- expression: >-
(64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+ SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+ (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+ SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+ SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+ 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+ SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
/ (TCC_REQ_sum * 128)
architectures: [gfx942, gfx90a]

# --- Compute: L1 arithmetic intensity ---
# L1 cache line: 128 bytes on gfx942, 64 bytes on gfx90a

- name: compute.l1_arithmetic_intensity
definitions:
- expression: >-
(64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+ SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+ (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+ SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+ SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+ 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+ SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
/ (TCP_TOTAL_CACHE_ACCESSES_sum * 128)
architectures: [gfx942]
- expression: >-
(64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+ SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+ (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+ SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+ SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+ 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+ SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
/ (TCP_TOTAL_CACHE_ACCESSES_sum * 64)
architectures: [gfx90a]
Loading
Loading