diff --git a/metrix/src/metrix/backends/base.py b/metrix/src/metrix/backends/base.py index 65c8cc5..aee27fb 100644 --- a/metrix/src/metrix/backends/base.py +++ b/metrix/src/metrix/backends/base.py @@ -26,15 +26,9 @@ class DeviceSpecs: # Memory specs hbm_bandwidth_gbs: float = 0.0 - l2_bandwidth_gbs: float = 0.0 l2_size_mb: float = 0.0 lds_size_per_cu_kb: float = 0.0 - # Compute capabilities - fp32_tflops: float = 0.0 - fp64_tflops: float = 0.0 - int8_tops: float = 0.0 - # Clock speeds boost_clock_mhz: int = 0 @@ -144,6 +138,7 @@ def _load_yaml_metrics_if_available(self): # Parse YAML and collect metrics matching this architecture first yaml_metrics = {} + yaml_unsupported = {} counters_section = yaml_data["rocprofiler-sdk"].get("counters", []) for counter_def in counters_section: @@ -164,13 +159,16 @@ def _load_yaml_metrics_if_available(self): if definition is None: continue + # Check if this metric is marked unsupported for this architecture + unsupported_reason = definition.get("unsupported_reason") + if unsupported_reason: + yaml_unsupported[counter_name] = unsupported_reason + continue + # Register counters: derived, reduce(), and built-in if "expression" in definition: expression = definition["expression"] - # Check if this is a simple reduce() expression - import re - reduce_match = re.match( r"^reduce\([A-Z_0-9]+,\s*(?:sum|max|min)\)$", expression.strip() ) @@ -193,15 +191,31 @@ def _load_yaml_metrics_if_available(self): "compute": lambda cn=counter_name: self._raw_data.get(cn, 0.0), } - if not yaml_metrics: + if not yaml_metrics and not yaml_unsupported: return # YAML metrics found for this arch -- replace @metric-based metrics self._metrics.clear() self._unsupported_metrics.clear() self._metrics.update(yaml_metrics) + self._unsupported_metrics.update(yaml_unsupported) print(f"✓ Loaded {len(self._metrics)} YAML-based metrics for {arch}") + @property + def _builtin_expression_vars(self) -> set: + """Variables injected into YAML expression namespace (not hardware counters). + + Derived from DeviceSpecs fields + DURATION_US so it stays in sync + automatically when new spec fields are added. + """ + import dataclasses + + names = {f.name.upper() for f in dataclasses.fields(self.device_specs)} + names.discard("ARCH") + names.discard("NAME") + names.add("DURATION_US") + return names + def _extract_counters_from_expression(self, expression: str) -> List[str]: """Extract counter names from YAML expression""" import re @@ -212,10 +226,10 @@ def _extract_counters_from_expression(self, expression: str) -> List[str]: for match in re.finditer(r"reduce\(([A-Z_0-9]+),\s*(?:sum|max|min)\)", expression): counters.add(match.group(1)) - # Extract standalone counter names + # Extract standalone counter names (uppercase identifiers) for match in re.finditer(r"\b([A-Z][A-Z_0-9]*(?:_sum)?)\b", expression): counter_name = match.group(1) - if counter_name not in ["CU_NUM"]: + if counter_name not in self._builtin_expression_vars: counters.add(counter_name) return sorted(list(counters)) @@ -225,8 +239,15 @@ def _create_yaml_compute_function(self, expression: str, metric_name: str): import re def compute(): + import dataclasses + namespace = dict(self._raw_data) - namespace["CU_NUM"] = self.device_specs.num_cu + + # Inject all DeviceSpecs fields as UPPER_CASE variables + for f in dataclasses.fields(self.device_specs): + if f.name not in ("arch", "name"): + namespace[f.name.upper()] = getattr(self.device_specs, f.name) + namespace["DURATION_US"] = getattr(self, "_current_duration_us", 0.0) # Replace reduce(X, op) with X_op processed_expr = re.sub( diff --git a/metrix/src/metrix/backends/counter_defs.yaml b/metrix/src/metrix/backends/counter_defs.yaml index dd08279..ba9222b 100644 --- a/metrix/src/metrix/backends/counter_defs.yaml +++ b/metrix/src/metrix/backends/counter_defs.yaml @@ -1,14 +1,278 @@ -# Public counter definitions for IntelliKit metrix -# These use counters available via rocprofv3 --list-avail (no proprietary data) +# Counter definitions for IntelliKit metrix +# Single source of truth for all architecture metric definitions. +# Uses counters available via rocprofv3 --list-avail (no proprietary data). +# +# Built-in variables available in expressions (from DeviceSpecs): +# NUM_CU, MAX_WAVES_PER_CU, WAVEFRONT_SIZE, BASE_CLOCK_MHZ, +# HBM_BANDWIDTH_GBS, L2_SIZE_MB, LDS_SIZE_PER_CU_KB, +# BOOST_CLOCK_MHZ, DURATION_US rocprofiler-sdk: counters: - - name: GPU_UTILIZATION + + # --- GPU utilization --- + + - name: compute.gpu_utilization definitions: - expression: (GRBM_GUI_ACTIVE / GRBM_COUNT) * 100 architectures: [gfx1201, gfx1151] - - name: L2_HIT_RATE + # --- Cache hit rates --- + + - name: memory.l2_hit_rate definitions: + - expression: (TCC_HIT_sum / (TCC_HIT_sum + TCC_MISS_sum)) * 100 + architectures: [gfx942, gfx90a] - expression: (GL2C_HIT_sum / (GL2C_HIT_sum + GL2C_MISS_sum)) * 100 architectures: [gfx1201, gfx1151] + + - name: memory.l1_hit_rate + definitions: + - expression: >- + ((TCP_TOTAL_CACHE_ACCESSES_sum - TCP_TCC_READ_REQ_sum) + / TCP_TOTAL_CACHE_ACCESSES_sum) * 100 + architectures: [gfx942, gfx90a] + + # --- L2 bandwidth --- + + - name: memory.l2_bandwidth + definitions: + - expression: >- + (((TCC_HIT_sum + TCC_MISS_sum) * 128) / 1e9) + / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6)) + architectures: [gfx942, gfx90a] + + # --- Bytes transferred --- + + - name: memory.bytes_transferred_l2 + definitions: + - expression: TCC_REQ_sum * 128 + architectures: [gfx942, gfx90a] + + - name: memory.bytes_transferred_l1 + definitions: + - expression: TCP_TOTAL_CACHE_ACCESSES_sum * 128 + architectures: [gfx942] + - expression: TCP_TOTAL_CACHE_ACCESSES_sum * 64 + architectures: [gfx90a] + + # --- Coalescing and efficiency --- + + - name: memory.coalescing_efficiency + definitions: + - expression: >- + min(((SQ_INSTS_VMEM_RD + SQ_INSTS_VMEM_WR) * 16 + / TCP_TOTAL_ACCESSES_sum) * 100, 100.0) + architectures: [gfx942, gfx90a] + + - name: memory.global_load_efficiency + definitions: + - expression: >- + min((SQ_INSTS_VMEM_RD / TCP_TCC_READ_REQ_sum) * 100, 100.0) + architectures: [gfx942, gfx90a] + + - name: memory.global_store_efficiency + definitions: + - expression: >- + min((SQ_INSTS_VMEM_WR / TCP_TCC_WRITE_REQ_sum) * 100, 100.0) + architectures: [gfx942, gfx90a] + + # --- LDS --- + + - name: memory.lds_bank_conflicts + definitions: + - expression: SQ_LDS_BANK_CONFLICT / SQ_INSTS_LDS + architectures: [gfx942, gfx90a] + + # --- HBM read bandwidth --- + # gfx942 has TCC_BUBBLE_sum for 128B reads; gfx90a does not + + - name: memory.hbm_read_bandwidth + definitions: + - expression: >- + ((TCC_BUBBLE_sum * 128 + + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 + + TCC_EA0_RDREQ_32B_sum * 32) / 1e9) + / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6)) + architectures: [gfx942] + - expression: >- + (((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 + + TCC_EA_RDREQ_32B_sum * 32) / 1e9) + / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6)) + architectures: [gfx90a] + + # --- HBM write bandwidth --- + + - name: memory.hbm_write_bandwidth + definitions: + - expression: >- + ((TCC_EA0_WRREQ_64B_sum * 64 + + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) / 1e9) + / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6)) + architectures: [gfx942] + - expression: >- + ((TCC_EA_WRREQ_64B_sum * 64 + + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32) / 1e9) + / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6)) + architectures: [gfx90a] + + # --- HBM bandwidth utilization --- + + - name: memory.hbm_bandwidth_utilization + definitions: + - expression: >- + ((TCC_BUBBLE_sum * 128 + + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 + + TCC_EA0_RDREQ_32B_sum * 32 + + TCC_EA0_WRREQ_64B_sum * 64 + + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) / 1e9) + / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6)) + / HBM_BANDWIDTH_GBS * 100 + architectures: [gfx942] + - expression: >- + (((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 + + TCC_EA_RDREQ_32B_sum * 32 + + TCC_EA_WRREQ_64B_sum * 64 + + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32) / 1e9) + / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6)) + / HBM_BANDWIDTH_GBS * 100 + architectures: [gfx90a] + + # --- Total HBM bytes --- + + - name: memory.bytes_transferred_hbm + definitions: + - expression: >- + TCC_BUBBLE_sum * 128 + + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 + + TCC_EA0_RDREQ_32B_sum * 32 + + TCC_EA0_WRREQ_64B_sum * 64 + + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32 + architectures: [gfx942] + - expression: >- + (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 + + TCC_EA_RDREQ_32B_sum * 32 + + TCC_EA_WRREQ_64B_sum * 64 + + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32 + architectures: [gfx90a] + + # --- Atomic latency --- + + - name: memory.atomic_latency + definitions: + - expression: TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum + architectures: [gfx942] + - unsupported_reason: >- + TCC_EA_ATOMIC_LEVEL_sum counter is broken on MI200 (gfx90a). + This metric only works correctly on MI300X (gfx942) and newer GPUs. + architectures: [gfx90a] + + # --- Compute: total FLOPS --- + + - name: compute.total_flops + definitions: + - expression: >- + 64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2) + + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2) + + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2)) + + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64) + architectures: [gfx942, gfx90a] + + # --- Compute: GFLOPS --- + + - name: compute.hbm_gflops + definitions: + - expression: >- + ((64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2) + + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2) + + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2)) + + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64)) + / 1e9) / (DURATION_US / 1e6) + if DURATION_US > 0 else 0.0 + architectures: [gfx942, gfx90a] + + # --- Compute: HBM arithmetic intensity --- + + - name: compute.hbm_arithmetic_intensity + definitions: + - expression: >- + (64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2) + + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2) + + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2)) + + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64)) + / (TCC_BUBBLE_sum * 128 + + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 + + TCC_EA0_RDREQ_32B_sum * 32 + + TCC_EA0_WRREQ_64B_sum * 64 + + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + architectures: [gfx942] + - expression: >- + (64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2) + + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2) + + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2)) + + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64)) + / ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 + + TCC_EA_RDREQ_32B_sum * 32 + + TCC_EA_WRREQ_64B_sum * 64 + + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32) + architectures: [gfx90a] + + # --- Compute: L2 arithmetic intensity --- + + - name: compute.l2_arithmetic_intensity + definitions: + - expression: >- + (64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2) + + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2) + + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2)) + + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64)) + / (TCC_REQ_sum * 128) + architectures: [gfx942, gfx90a] + + # --- Compute: L1 arithmetic intensity --- + # L1 cache line: 128 bytes on gfx942, 64 bytes on gfx90a + + - name: compute.l1_arithmetic_intensity + definitions: + - expression: >- + (64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2) + + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2) + + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2)) + + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64)) + / (TCP_TOTAL_CACHE_ACCESSES_sum * 128) + architectures: [gfx942] + - expression: >- + (64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2) + + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2) + + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2)) + + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16 + + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64)) + / (TCP_TOTAL_CACHE_ACCESSES_sum * 64) + architectures: [gfx90a] diff --git a/metrix/src/metrix/backends/device_info.py b/metrix/src/metrix/backends/device_info.py new file mode 100644 index 0000000..ddb56ca --- /dev/null +++ b/metrix/src/metrix/backends/device_info.py @@ -0,0 +1,295 @@ +""" +Dynamic GPU device info from rocminfo / rocm-smi + peak spec lookup table. + +Queryable values (num_cu, wavefront_size, etc.) are read from the live +system so that one backend class works for every SKU in an architecture +family (e.g. MI210 vs MI250X both use gfx90a). + +Theoretical peak values (TFLOPS, HBM bandwidth) that cannot be read from +hardware are stored in a small per-chip-ID table with source links. + +When the requested arch does not match the GPU actually installed (e.g. +unit tests creating a gfx942 backend on an MI210 machine), static +fallback specs are used instead. +""" + +from __future__ import annotations + +import re +import subprocess +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict + +if TYPE_CHECKING: + from .base import DeviceSpecs + + +# --------------------------------------------------------------------------- +# Peak specs that cannot be queried from hardware. +# Keyed by (gfx_arch, chip_id_hex) so different SKUs within the same arch +# get correct values. chip_id_hex = None acts as the arch-level default. +# +# Sources are listed per-entry so they can be verified / updated. +# --------------------------------------------------------------------------- +# --------------------------------------------------------------------------- +# Fallback specs used when the requested arch differs from the installed GPU +# (e.g. unit tests) or when rocminfo is unavailable. +# +# Sources: +# HW specs: https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html +# MI300X peaks: https://www.amd.com/en/products/accelerators/instinct/mi300/platform.html +# MI210 peaks: https://www.amd.com/en/products/accelerators/instinct/mi200/mi210.html +# --------------------------------------------------------------------------- +def _fallback_specs() -> Dict[str, "DeviceSpecs"]: + from .base import DeviceSpecs + + return { + "gfx942": DeviceSpecs( + arch="gfx942", + name="AMD Instinct MI300X", + num_cu=304, + max_waves_per_cu=32, + wavefront_size=64, + base_clock_mhz=2100.0, + hbm_bandwidth_gbs=5300.0, + l2_size_mb=256.0, + lds_size_per_cu_kb=64.0, + boost_clock_mhz=2100, + ), + "gfx90a": DeviceSpecs( + arch="gfx90a", + name="AMD Instinct MI210", + num_cu=104, + max_waves_per_cu=32, + wavefront_size=64, + base_clock_mhz=1700.0, + hbm_bandwidth_gbs=1600.0, + l2_size_mb=8.0, + lds_size_per_cu_kb=64.0, + boost_clock_mhz=1600, + ), + } + + +# HBM peak bandwidth per arch (only value not queryable from hardware) +_HBM_PEAK_GBS: Dict[str, float] = { + "gfx942": 5300.0, + "gfx90a": 1600.0, +} + + +# --------------------------------------------------------------------------- +# rocminfo parser — one call, structured results for the first GPU agent +# --------------------------------------------------------------------------- +@dataclass +class RocmInfoGPU: + """Parsed GPU agent block from rocminfo.""" + + arch: str = "" + marketing_name: str = "" + chip_id_hex: str = "" + num_cu: int = 0 + simds_per_cu: int = 0 + max_waves_per_cu: int = 0 + wavefront_size: int = 64 + max_clock_mhz: int = 0 + l1_cache_kb: int = 0 + l2_cache_kb: int = 0 + lds_size_kb: int = 0 + + +def _parse_rocminfo() -> RocmInfoGPU: + """Run ``rocminfo`` and parse the first GPU agent.""" + try: + proc = subprocess.run(["rocminfo"], capture_output=True, text=True, timeout=10) + except (FileNotFoundError, subprocess.TimeoutExpired) as exc: + raise RuntimeError(f"rocminfo unavailable: {exc}") from exc + + if proc.returncode != 0: + raise RuntimeError(f"rocminfo failed (rc={proc.returncode}): {proc.stderr}") + + gpu = RocmInfoGPU() + in_gpu_agent = False + in_cache = False + found_group_segment = False + + pending_agent_name = "" + pending_marketing_name = "" + + for line in proc.stdout.splitlines(): + stripped = line.strip() + + if stripped.startswith("*******"): + if in_gpu_agent: + break + pending_agent_name = "" + pending_marketing_name = "" + continue + + if stripped.startswith("Name:") and not in_gpu_agent: + pending_agent_name = stripped.split(":", 1)[1].strip() + continue + + if stripped.startswith("Marketing Name:") and not in_gpu_agent: + pending_marketing_name = stripped.split(":", 1)[1].strip() + continue + + if "Device Type:" in stripped and "GPU" in stripped: + in_gpu_agent = True + in_cache = False + m = re.search(r"(gfx\w+)", pending_agent_name) + if m: + gpu.arch = m.group(1) + gpu.marketing_name = pending_marketing_name + continue + + if not in_gpu_agent: + continue + + if "Device Type:" in stripped and "GPU" not in stripped: + break + + if "Cache Info:" in stripped: + in_cache = True + continue + if "Pool Info:" in stripped or "ISA Info:" in stripped: + in_cache = False + + if in_cache: + m = re.match(r"L1:\s+(\d+)", stripped) + if m: + gpu.l1_cache_kb = int(m.group(1)) + m = re.match(r"L2:\s+(\d+)", stripped) + if m: + gpu.l2_cache_kb = int(m.group(1)) + + if stripped.startswith("Chip ID:"): + m = re.search(r"\((0x[0-9a-fA-F]+)\)", stripped) + if m: + gpu.chip_id_hex = m.group(1).lower() + elif stripped.startswith("Compute Unit:"): + m = re.search(r"(\d+)", stripped.split(":")[1]) + if m: + gpu.num_cu = int(m.group(1)) + elif stripped.startswith("SIMDs per CU:"): + m = re.search(r"(\d+)", stripped.split(":")[1]) + if m: + gpu.simds_per_cu = int(m.group(1)) + elif stripped.startswith("Max Waves Per CU:"): + m = re.search(r"(\d+)", stripped.split(":")[1]) + if m: + gpu.max_waves_per_cu = int(m.group(1)) + elif stripped.startswith("Wavefront Size:"): + m = re.search(r"(\d+)", stripped.split(":")[1]) + if m: + gpu.wavefront_size = int(m.group(1)) + elif stripped.startswith("Max Clock Freq"): + m = re.search(r"(\d+)", stripped.split(":")[1]) + if m: + gpu.max_clock_mhz = int(m.group(1)) + elif "Segment:" in stripped and "GROUP" in stripped: + found_group_segment = True + elif found_group_segment and stripped.startswith("Size:"): + m = re.search(r"(\d+)", stripped.split(":")[1]) + if m: + gpu.lds_size_kb = int(m.group(1)) + found_group_segment = False + + return gpu + + +def _parse_rocm_smi_json() -> dict: + """Run ``rocm-smi --json`` with relevant flags and return first card.""" + try: + proc = subprocess.run( + [ + "rocm-smi", + "--showproductname", + "--showsclkrange", + "--showmclkrange", + "--showmeminfo", + "all", + "--json", + ], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return {} + + if proc.returncode != 0: + return {} + + import json + + try: + data = json.loads(proc.stdout) + except json.JSONDecodeError: + return {} + + for key in sorted(data.keys()): + if key.startswith("card"): + return data[key] + return {} + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- +def query_device_specs(arch: str) -> "DeviceSpecs": + """ + Build a DeviceSpecs by querying rocminfo/rocm-smi for live values. + + If the requested *arch* does not match the GPU actually installed + (common in unit tests), a static fallback is returned instead. + + Args: + arch: GFX architecture string (e.g. "gfx90a", "gfx942") + + Returns: + Fully populated DeviceSpecs + """ + from .base import DeviceSpecs + + # Try live query + hw_arch = None + try: + gpu = _parse_rocminfo() + hw_arch = gpu.arch or None + except RuntimeError: + gpu = None + + # If the hardware matches the requested arch, use live values + if gpu and hw_arch == arch: + smi = _parse_rocm_smi_json() + + boost_clock_mhz = 0 + mclk_range = smi.get("Valid mclk range", "") + m = re.search(r"(\d+)\s*Mhz\s*$", mclk_range) + if m: + boost_clock_mhz = int(m.group(1)) + + return DeviceSpecs( + arch=arch, + name=gpu.marketing_name or f"AMD GPU ({arch})", + num_cu=gpu.num_cu, + max_waves_per_cu=gpu.max_waves_per_cu, + wavefront_size=gpu.wavefront_size, + base_clock_mhz=float(gpu.max_clock_mhz), + hbm_bandwidth_gbs=_HBM_PEAK_GBS.get(arch, 0.0), + l2_size_mb=gpu.l2_cache_kb / 1024.0, + lds_size_per_cu_kb=float(gpu.lds_size_kb or 64), + boost_clock_mhz=boost_clock_mhz or gpu.max_clock_mhz, + ) + + # Arch mismatch or rocminfo unavailable — use static fallback + fallback = _fallback_specs() + if arch in fallback: + return fallback[arch] + + return DeviceSpecs( + arch=arch, + name=f"AMD GPU ({arch})", + hbm_bandwidth_gbs=_HBM_PEAK_GBS.get(arch, 0.0), + ) diff --git a/metrix/src/metrix/backends/gfx1151.py b/metrix/src/metrix/backends/gfx1151.py index 048c12d..f4e80e3 100644 --- a/metrix/src/metrix/backends/gfx1151.py +++ b/metrix/src/metrix/backends/gfx1151.py @@ -4,16 +4,12 @@ Shares all hardware configuration with gfx1201 (RDNA4). """ -from .base import DeviceSpecs +from .device_info import query_device_specs from .gfx1201 import GFX1201Backend class GFX1151Backend(GFX1201Backend): """AMD RDNA4 (GFX1151) backend - same hardware config as gfx1201.""" - def _get_device_specs(self) -> DeviceSpecs: - return DeviceSpecs( - arch="gfx1151", - name="AMD Radeon Graphics (RDNA4)", - wavefront_size=32, - ) + def _get_device_specs(self): + return query_device_specs("gfx1151") diff --git a/metrix/src/metrix/backends/gfx1201.py b/metrix/src/metrix/backends/gfx1201.py index ee477af..7039de2 100644 --- a/metrix/src/metrix/backends/gfx1201.py +++ b/metrix/src/metrix/backends/gfx1201.py @@ -2,9 +2,12 @@ GFX1201 (RDNA4) Backend Metrics are loaded from counter_defs.yaml. +This file provides architecture-specific infrastructure only. +Device specs are queried from rocminfo / rocm-smi at runtime. """ -from .base import CounterBackend, DeviceSpecs, ProfileResult, Statistics +from .base import CounterBackend, ProfileResult +from .device_info import query_device_specs from ..profiler.rocprof_wrapper import ROCProfV3Wrapper from pathlib import Path from typing import List, Optional @@ -13,42 +16,8 @@ class GFX1201Backend(CounterBackend): """AMD RDNA4 (gfx1201) backend.""" - def get_metric_counters(self, metric: str) -> List[str]: - if metric not in self._metrics: - return [metric] - return list(self._metrics[metric]["counters"]) - - def get_required_counters(self, metrics: List[str]) -> List[str]: - counters = set() - skip = {"duration_us"} - for metric in metrics: - if metric not in self._metrics: - counters.add(metric) - else: - counters.update(c for c in self._metrics[metric]["counters"] if c not in skip) - return list(counters) - - def compute_metric_stats(self, dispatch_key: str, metric: str) -> Statistics: - if dispatch_key not in self._aggregated: - raise KeyError(f"Unknown dispatch key: {dispatch_key}") - counter_stats = self._aggregated[dispatch_key] - if metric not in self._metrics: - if metric in counter_stats: - return counter_stats[metric] - return Statistics(min=0.0, max=0.0, avg=0.0, count=0) - metric_min = self._compute_with_stat_type(metric, counter_stats, "min") - metric_max = self._compute_with_stat_type(metric, counter_stats, "max") - metric_avg = self._compute_with_stat_type(metric, counter_stats, "avg") - first_counter = list(counter_stats.keys())[0] - count = counter_stats[first_counter].count - return Statistics(min=metric_min, max=metric_max, avg=metric_avg, count=count) - - def _get_device_specs(self) -> DeviceSpecs: - return DeviceSpecs( - arch="gfx1201", - name="AMD Radeon Graphics (RDNA4)", - wavefront_size=32, - ) + def _get_device_specs(self): + return query_device_specs("gfx1201") def _run_rocprof( self, diff --git a/metrix/src/metrix/backends/gfx90a.py b/metrix/src/metrix/backends/gfx90a.py index bc38992..d254155 100644 --- a/metrix/src/metrix/backends/gfx90a.py +++ b/metrix/src/metrix/backends/gfx90a.py @@ -1,13 +1,14 @@ """ GFX90a (MI200) Backend -Each metric is defined with @metric decorator. -Counter names appear EXACTLY ONCE - as function parameters. +Metrics are loaded from counter_defs.yaml. +This file provides architecture-specific infrastructure only. +Device specs are queried from rocminfo / rocm-smi at runtime. """ from .base import CounterBackend, DeviceSpecs, ProfileResult +from .device_info import query_device_specs from ..utils.common import split_counters_into_passes -from .decorator import metric from ..profiler.rocprof_wrapper import ROCProfV3Wrapper from typing import List, Optional, Dict @@ -16,28 +17,11 @@ class GFX90aBackend(CounterBackend): """ AMD MI200 (gfx90a) counter backend - All metrics are defined with @metric decorator. - Hardware counter names appear ONLY as function parameter names. + Metric definitions live in counter_defs.yaml. """ def _get_device_specs(self) -> DeviceSpecs: - """MI200 specifications""" - return DeviceSpecs( - arch="gfx90a", - name="AMD Instinct MI200", - num_cu=104, - max_waves_per_cu=32, - wavefront_size=64, - base_clock_mhz=1700.0, - hbm_bandwidth_gbs=3200.0, - l2_bandwidth_gbs=11000.0, - l2_size_mb=16.0, - lds_size_per_cu_kb=64.0, - fp32_tflops=47.9, - fp64_tflops=47.9, - int8_tops=383, - boost_clock_mhz=1600, - ) + return query_device_specs("gfx90a") def _get_counter_groups(self, counters: List[str]) -> List[List[str]]: """ @@ -63,33 +47,18 @@ def _get_counter_block_limits(self) -> Dict[str, int]: These limits define how many performance counters can be simultaneously collected from each hardware block in a single profiling pass. - - Hardware blocks on MI200: - - SQ (Shader): Instruction counters (VALU, LDS, VMEM, etc.) - - TA (Texture Addresser): Texture address operations - - TD (Texture Data): Texture data fetch operations - - TCP (Texture Cache per Pipe): L1 vector cache - - TCC (Texture Cache Channel): L2 cache and memory controller - - CPC (Command Processor - Compute): Compute command processing - - CPF (Command Processor - Fetch): Command fetch operations - - SPI (Shader Processor Input): Wavefront dispatch and scheduling - - GRBM (Graphics Register Bus Manager): Global GPU activity - - GDS (Global Data Share): Inter-workgroup communication - - Returns: - Dict mapping block_name -> max_counters_per_pass """ return { - "SQ": 8, # Shader - instruction counters - "TA": 2, # Texture Addresser - "TD": 2, # Texture Data - "TCP": 4, # L1 Cache (Texture Cache per Pipe) - "TCC": 4, # L2 Cache / Memory Controller - "CPC": 2, # Command Processor - Compute - "CPF": 2, # Command Processor - Fetch - "SPI": 6, # Shader Processor Input - "GRBM": 2, # Graphics Register Bus Manager - "GDS": 4, # Global Data Share + "SQ": 8, + "TA": 2, + "TD": 2, + "TCP": 4, + "TCC": 4, + "CPC": 2, + "CPF": 2, + "SPI": 6, + "GRBM": 2, + "GDS": 4, } def _run_rocprof( @@ -103,537 +72,3 @@ def _run_rocprof( """Run rocprofv3 and return results (single pass only - base class handles multi-pass)""" wrapper = ROCProfV3Wrapper(timeout_seconds=timeout_seconds) return wrapper.profile(command, counters, kernel_filter=kernel_filter, cwd=cwd) - - # Memory bandwidth metrics - - @metric("memory.hbm_read_bandwidth") - def _hbm_read_bandwidth(self, TCC_EA_RDREQ_sum, TCC_EA_RDREQ_32B_sum, GRBM_GUI_ACTIVE): - """ - HBM read bandwidth in GB/s - - Formula: (64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq) - """ - # Calculate bytes with 32B/64B distinction - bytes_read_64B = (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 - bytes_read_32B = TCC_EA_RDREQ_32B_sum * 32 - bytes_read = bytes_read_64B + bytes_read_32B - - if GRBM_GUI_ACTIVE == 0: - return 0.0 - - time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6) - return (bytes_read / 1e9) / time_seconds if time_seconds > 0 else 0.0 - - @metric("memory.hbm_write_bandwidth") - def _hbm_write_bandwidth(self, TCC_EA_WRREQ_sum, TCC_EA_WRREQ_64B_sum, GRBM_GUI_ACTIVE): - """ - HBM write bandwidth in GB/s (with 32B/64B request granularity) - - Formula: (64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq) - """ - # Calculate bytes with 32B/64B distinction - bytes_written_64B = TCC_EA_WRREQ_64B_sum * 64 - bytes_written_32B = (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32 - bytes_written = bytes_written_64B + bytes_written_32B - - if GRBM_GUI_ACTIVE == 0: - return 0.0 - - time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6) - return (bytes_written / 1e9) / time_seconds if time_seconds > 0 else 0.0 - - @metric("memory.hbm_bandwidth_utilization") - def _hbm_bandwidth_utilization( - self, - TCC_EA_RDREQ_sum, - TCC_EA_RDREQ_32B_sum, - TCC_EA_WRREQ_sum, - TCC_EA_WRREQ_64B_sum, - GRBM_GUI_ACTIVE, - ): - """ - HBM bandwidth utilization as percentage of peak - - Formula: (actual_bandwidth / peak_bandwidth) * 100 - """ - # Calculate bytes with 32B/64B distinction - bytes_read = (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 + TCC_EA_RDREQ_32B_sum * 32 - bytes_written = TCC_EA_WRREQ_64B_sum * 64 + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32 - total_bytes = bytes_read + bytes_written - - if GRBM_GUI_ACTIVE == 0: - return 0.0 - - time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6) - actual_bw_gbs = (total_bytes / 1e9) / time_seconds if time_seconds > 0 else 0.0 - - return (actual_bw_gbs / self.device_specs.hbm_bandwidth_gbs) * 100 - - @metric("memory.bytes_transferred_hbm") - def _bytes_transferred_hbm( - self, TCC_EA_RDREQ_sum, TCC_EA_RDREQ_32B_sum, TCC_EA_WRREQ_sum, TCC_EA_WRREQ_64B_sum - ): - """ - Total bytes transferred through HBM - - Formula: (64B_read_requests * 64 + 32B_read_requests * 32 + - 64B_write_requests * 64 + 32B_write_requests * 32) - """ - bytes_read = (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 + TCC_EA_RDREQ_32B_sum * 32 - bytes_written = TCC_EA_WRREQ_64B_sum * 64 + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32 - return bytes_read + bytes_written - - @metric("memory.bytes_transferred_l2") - def _bytes_transferred_l2(self, TCC_REQ_sum): - """ - Total bytes transferred through L2 cache - - Formula: TCC_REQ_sum * 128 (L2 cache line size is 128 bytes) - """ - return TCC_REQ_sum * 128 - - @metric("memory.bytes_transferred_l1") - def _bytes_transferred_l1(self, TCP_TOTAL_CACHE_ACCESSES_sum): - """ - Total bytes transferred through L1 cache - - Formula: TCP_TOTAL_CACHE_ACCESSES_sum * 64 (L1 cache line size is 64 bytes) - """ - return TCP_TOTAL_CACHE_ACCESSES_sum * 64 - - # Cache metrics - - @metric("memory.l2_hit_rate") - def _l2_hit_rate(self, TCC_HIT_sum, TCC_MISS_sum): - """ - L2 cache hit rate as percentage - - Formula: (hits / (hits + misses)) * 100 - """ - total = TCC_HIT_sum + TCC_MISS_sum - return (TCC_HIT_sum / total) * 100 if total > 0 else 0.0 - - @metric("memory.l1_hit_rate") - def _l1_hit_rate(self, TCP_TCC_READ_REQ_sum, TCP_TOTAL_CACHE_ACCESSES_sum): - """ - L1 cache hit rate as percentage - - Formula: ((total_accesses - l1_misses) / total_accesses) * 100 - L1 misses go to L2 (TCC), so misses = TCP_TCC_READ_REQ - """ - if TCP_TOTAL_CACHE_ACCESSES_sum == 0: - return 0.0 - - l1_hits = TCP_TOTAL_CACHE_ACCESSES_sum - TCP_TCC_READ_REQ_sum - return (l1_hits / TCP_TOTAL_CACHE_ACCESSES_sum) * 100 - - @metric("memory.l2_bandwidth") - def _l2_bandwidth(self, TCC_HIT_sum, TCC_MISS_sum, GRBM_GUI_ACTIVE): - """ - L2 cache bandwidth in GB/s - - Formula: (total_accesses * 128 bytes) / time - Note: L2 cacheline is 128 bytes - """ - total_accesses = TCC_HIT_sum + TCC_MISS_sum - bytes_accessed = total_accesses * 128 # L2 cacheline size - - if GRBM_GUI_ACTIVE == 0: - return 0.0 - - time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6) - return (bytes_accessed / 1e9) / time_seconds if time_seconds > 0 else 0.0 - - # Coalescing metrics - - @metric("memory.coalescing_efficiency") - def _coalescing_efficiency(self, SQ_INSTS_VMEM_RD, SQ_INSTS_VMEM_WR, TCP_TOTAL_ACCESSES_sum): - """ - Memory coalescing efficiency as percentage - - Formula: (total_memory_instructions * 16 / total_cache_accesses) * 100 - - Physical meaning: - - Perfect coalescing (stride=1): 100% (minimal cache accesses) - - Poor coalescing (stride>1): 25% for float, 50% for double - - This represents actual bandwidth efficiency, not rescaled. - """ - total_instructions = SQ_INSTS_VMEM_RD + SQ_INSTS_VMEM_WR - - if TCP_TOTAL_ACCESSES_sum == 0: - return 0.0 - - # 16 = 64 threads per wavefront / 4 threads per cacheline - efficiency = (total_instructions * 16 / TCP_TOTAL_ACCESSES_sum) * 100 - - # Cap at 100% (can happen due to prefetching) - return min(efficiency, 100.0) - - @metric("memory.global_load_efficiency") - def _global_load_efficiency(self, SQ_INSTS_VMEM_RD, TCP_TCC_READ_REQ_sum): - """ - Global load efficiency - ratio of requested vs fetched memory - - Formula: (read_instructions * 64 bytes / read_requests * 64 bytes) * 100 - Simplifies to: (read_instructions / read_requests) * 100 - """ - if TCP_TCC_READ_REQ_sum == 0: - return 0.0 - - return min((SQ_INSTS_VMEM_RD / TCP_TCC_READ_REQ_sum) * 100, 100.0) - - @metric("memory.global_store_efficiency") - def _global_store_efficiency(self, SQ_INSTS_VMEM_WR, TCP_TCC_WRITE_REQ_sum): - """ - Global store efficiency - ratio of requested vs written memory - - Formula: (write_instructions / write_requests) * 100 - """ - if TCP_TCC_WRITE_REQ_sum == 0: - return 0.0 - - return min((SQ_INSTS_VMEM_WR / TCP_TCC_WRITE_REQ_sum) * 100, 100.0) - - # LDS metrics - - @metric("memory.lds_bank_conflicts") - def _lds_bank_conflicts(self, SQ_LDS_BANK_CONFLICT, SQ_INSTS_LDS): - """ - LDS bank conflicts per instruction - - Formula: total_conflicts / total_lds_instructions - """ - if SQ_INSTS_LDS == 0: - return 0.0 - - return SQ_LDS_BANK_CONFLICT / SQ_INSTS_LDS - - # Atomic metrics - - @metric( - "memory.atomic_latency", - unsupported_reason="TCC_EA_ATOMIC_LEVEL_sum counter is broken on MI200 (gfx90a). " - "This metric only works correctly on MI300X (gfx942) and newer GPUs.", - ) - def _atomic_latency(self, TCC_EA_ATOMIC_LEVEL_sum, TCC_EA_ATOMIC_sum): - """ - Average atomic operation latency in cycles (L2 cache atomic latency) - - Formula: TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum (MI200 counters) - - Note: This measures atomic operations to/from L2 cache, not GDS operations. - GDS (Global Data Share) is a special feature rarely used by most kernels. - """ - if TCC_EA_ATOMIC_sum == 0: - return 0.0 - - return TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum - - # Compute metrics - - @metric("compute.total_flops") - def _total_flops( - self, - SQ_INSTS_VALU_ADD_F16, - SQ_INSTS_VALU_MUL_F16, - SQ_INSTS_VALU_TRANS_F16, - SQ_INSTS_VALU_FMA_F16, - SQ_INSTS_VALU_ADD_F32, - SQ_INSTS_VALU_MUL_F32, - SQ_INSTS_VALU_TRANS_F32, - SQ_INSTS_VALU_FMA_F32, - SQ_INSTS_VALU_ADD_F64, - SQ_INSTS_VALU_MUL_F64, - SQ_INSTS_VALU_TRANS_F64, - SQ_INSTS_VALU_FMA_F64, - SQ_INSTS_VALU_MFMA_MOPS_F16, - SQ_INSTS_VALU_MFMA_MOPS_BF16, - SQ_INSTS_VALU_MFMA_MOPS_F32, - SQ_INSTS_VALU_MFMA_MOPS_F64, - ): - """ - Total floating-point operations performed by the kernel - - Formula: 64 * (FP16 + FP32 + FP64) + 512 * MFMA - - 64 operations per wave (wavefront size = 64) - - FMA counts as 2 operations (multiply + add) - - MFMA instructions produce 512 operations per instruction - """ - fops = 64 * ( - ( - SQ_INSTS_VALU_ADD_F16 - + SQ_INSTS_VALU_MUL_F16 - + SQ_INSTS_VALU_TRANS_F16 - + SQ_INSTS_VALU_FMA_F16 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 - + SQ_INSTS_VALU_TRANS_F32 - + SQ_INSTS_VALU_FMA_F32 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64 - + SQ_INSTS_VALU_TRANS_F64 - + SQ_INSTS_VALU_FMA_F64 * 2 - ) - ) + 512 * ( - SQ_INSTS_VALU_MFMA_MOPS_F16 - + SQ_INSTS_VALU_MFMA_MOPS_BF16 - + SQ_INSTS_VALU_MFMA_MOPS_F32 - + SQ_INSTS_VALU_MFMA_MOPS_F64 - ) - - return fops - - @metric("compute.hbm_gflops") - def _hbm_gflops( - self, - SQ_INSTS_VALU_ADD_F16, - SQ_INSTS_VALU_MUL_F16, - SQ_INSTS_VALU_TRANS_F16, - SQ_INSTS_VALU_FMA_F16, - SQ_INSTS_VALU_ADD_F32, - SQ_INSTS_VALU_MUL_F32, - SQ_INSTS_VALU_TRANS_F32, - SQ_INSTS_VALU_FMA_F32, - SQ_INSTS_VALU_ADD_F64, - SQ_INSTS_VALU_MUL_F64, - SQ_INSTS_VALU_TRANS_F64, - SQ_INSTS_VALU_FMA_F64, - SQ_INSTS_VALU_MFMA_MOPS_F16, - SQ_INSTS_VALU_MFMA_MOPS_BF16, - SQ_INSTS_VALU_MFMA_MOPS_F32, - SQ_INSTS_VALU_MFMA_MOPS_F64, - ): - """ - Compute throughput (GFLOPS) using profiler kernel duration. - - Formula: (total_flops / 1e9) / (duration_us / 1e6) - Duration is set by the base class from profiler timestamps before calling. - """ - # Calculate total FLOPS (same as compute.total_flops) - fops = 64 * ( - ( - SQ_INSTS_VALU_ADD_F16 - + SQ_INSTS_VALU_MUL_F16 - + SQ_INSTS_VALU_TRANS_F16 - + SQ_INSTS_VALU_FMA_F16 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 - + SQ_INSTS_VALU_TRANS_F32 - + SQ_INSTS_VALU_FMA_F32 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64 - + SQ_INSTS_VALU_TRANS_F64 - + SQ_INSTS_VALU_FMA_F64 * 2 - ) - ) + 512 * ( - SQ_INSTS_VALU_MFMA_MOPS_F16 - + SQ_INSTS_VALU_MFMA_MOPS_BF16 - + SQ_INSTS_VALU_MFMA_MOPS_F32 - + SQ_INSTS_VALU_MFMA_MOPS_F64 - ) - - duration_us = getattr(self, "_current_duration_us", 0.0) - if duration_us <= 0: - return 0.0 - - time_seconds = duration_us / 1e6 - gflops = (fops / 1e9) / time_seconds - - return gflops - - @metric("compute.hbm_arithmetic_intensity") - def _hbm_arithmetic_intensity( - self, - SQ_INSTS_VALU_ADD_F16, - SQ_INSTS_VALU_MUL_F16, - SQ_INSTS_VALU_TRANS_F16, - SQ_INSTS_VALU_FMA_F16, - SQ_INSTS_VALU_ADD_F32, - SQ_INSTS_VALU_MUL_F32, - SQ_INSTS_VALU_TRANS_F32, - SQ_INSTS_VALU_FMA_F32, - SQ_INSTS_VALU_ADD_F64, - SQ_INSTS_VALU_MUL_F64, - SQ_INSTS_VALU_TRANS_F64, - SQ_INSTS_VALU_FMA_F64, - SQ_INSTS_VALU_MFMA_MOPS_F16, - SQ_INSTS_VALU_MFMA_MOPS_BF16, - SQ_INSTS_VALU_MFMA_MOPS_F32, - SQ_INSTS_VALU_MFMA_MOPS_F64, - TCC_EA_RDREQ_sum, - TCC_EA_RDREQ_32B_sum, - TCC_EA_WRREQ_sum, - TCC_EA_WRREQ_64B_sum, - ): - """ - HBM Arithmetic Intensity: ratio of floating-point operations to HBM bytes transferred (FLOP/byte) - - Formula: total_flops / hbm_bytes - """ - # Calculate total FLOPS - fops = 64 * ( - ( - SQ_INSTS_VALU_ADD_F16 - + SQ_INSTS_VALU_MUL_F16 - + SQ_INSTS_VALU_TRANS_F16 - + SQ_INSTS_VALU_FMA_F16 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 - + SQ_INSTS_VALU_TRANS_F32 - + SQ_INSTS_VALU_FMA_F32 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64 - + SQ_INSTS_VALU_TRANS_F64 - + SQ_INSTS_VALU_FMA_F64 * 2 - ) - ) + 512 * ( - SQ_INSTS_VALU_MFMA_MOPS_F16 - + SQ_INSTS_VALU_MFMA_MOPS_BF16 - + SQ_INSTS_VALU_MFMA_MOPS_F32 - + SQ_INSTS_VALU_MFMA_MOPS_F64 - ) - - # Calculate HBM bytes (with 32B/64B/128B distinction) - hbm_rd = (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 + TCC_EA_RDREQ_32B_sum * 32 - hbm_wr = TCC_EA_WRREQ_64B_sum * 64 + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32 - hbm_bytes = hbm_rd + hbm_wr - - # Arithmetic intensity = FLOP / byte - ai_hbm = fops / hbm_bytes if hbm_bytes > 0 else 0.0 - - return ai_hbm - - @metric("compute.l2_arithmetic_intensity") - def _l2_arithmetic_intensity( - self, - SQ_INSTS_VALU_ADD_F16, - SQ_INSTS_VALU_MUL_F16, - SQ_INSTS_VALU_TRANS_F16, - SQ_INSTS_VALU_FMA_F16, - SQ_INSTS_VALU_ADD_F32, - SQ_INSTS_VALU_MUL_F32, - SQ_INSTS_VALU_TRANS_F32, - SQ_INSTS_VALU_FMA_F32, - SQ_INSTS_VALU_ADD_F64, - SQ_INSTS_VALU_MUL_F64, - SQ_INSTS_VALU_TRANS_F64, - SQ_INSTS_VALU_FMA_F64, - SQ_INSTS_VALU_MFMA_MOPS_F16, - SQ_INSTS_VALU_MFMA_MOPS_BF16, - SQ_INSTS_VALU_MFMA_MOPS_F32, - SQ_INSTS_VALU_MFMA_MOPS_F64, - TCC_REQ_sum, - ): - """ - L2 Arithmetic Intensity: ratio of floating-point operations to L2 cache bytes accessed (FLOP/byte) - - Formula: total_flops / l2_bytes - """ - # Calculate total FLOPS - fops = 64 * ( - ( - SQ_INSTS_VALU_ADD_F16 - + SQ_INSTS_VALU_MUL_F16 - + SQ_INSTS_VALU_TRANS_F16 - + SQ_INSTS_VALU_FMA_F16 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 - + SQ_INSTS_VALU_TRANS_F32 - + SQ_INSTS_VALU_FMA_F32 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64 - + SQ_INSTS_VALU_TRANS_F64 - + SQ_INSTS_VALU_FMA_F64 * 2 - ) - ) + 512 * ( - SQ_INSTS_VALU_MFMA_MOPS_F16 - + SQ_INSTS_VALU_MFMA_MOPS_BF16 - + SQ_INSTS_VALU_MFMA_MOPS_F32 - + SQ_INSTS_VALU_MFMA_MOPS_F64 - ) - - # Calculate L2 bytes (L2 cache line is 128 bytes) - l2_bytes = TCC_REQ_sum * 128 - - # Arithmetic intensity = FLOP / byte - ai_l2 = fops / l2_bytes if l2_bytes > 0 else 0.0 - - return ai_l2 - - @metric("compute.l1_arithmetic_intensity") - def _l1_arithmetic_intensity( - self, - SQ_INSTS_VALU_ADD_F16, - SQ_INSTS_VALU_MUL_F16, - SQ_INSTS_VALU_TRANS_F16, - SQ_INSTS_VALU_FMA_F16, - SQ_INSTS_VALU_ADD_F32, - SQ_INSTS_VALU_MUL_F32, - SQ_INSTS_VALU_TRANS_F32, - SQ_INSTS_VALU_FMA_F32, - SQ_INSTS_VALU_ADD_F64, - SQ_INSTS_VALU_MUL_F64, - SQ_INSTS_VALU_TRANS_F64, - SQ_INSTS_VALU_FMA_F64, - SQ_INSTS_VALU_MFMA_MOPS_F16, - SQ_INSTS_VALU_MFMA_MOPS_BF16, - SQ_INSTS_VALU_MFMA_MOPS_F32, - SQ_INSTS_VALU_MFMA_MOPS_F64, - TCP_TOTAL_CACHE_ACCESSES_sum, - ): - """ - L1 Arithmetic Intensity: ratio of floating-point operations to L1 cache bytes accessed (FLOP/byte) - - Formula: total_flops / l1_bytes - """ - # Calculate total FLOPS - fops = 64 * ( - ( - SQ_INSTS_VALU_ADD_F16 - + SQ_INSTS_VALU_MUL_F16 - + SQ_INSTS_VALU_TRANS_F16 - + SQ_INSTS_VALU_FMA_F16 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 - + SQ_INSTS_VALU_TRANS_F32 - + SQ_INSTS_VALU_FMA_F32 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64 - + SQ_INSTS_VALU_TRANS_F64 - + SQ_INSTS_VALU_FMA_F64 * 2 - ) - ) + 512 * ( - SQ_INSTS_VALU_MFMA_MOPS_F16 - + SQ_INSTS_VALU_MFMA_MOPS_BF16 - + SQ_INSTS_VALU_MFMA_MOPS_F32 - + SQ_INSTS_VALU_MFMA_MOPS_F64 - ) - - # Calculate L1 bytes (L1 cache line is 64 bytes on gfx90a) - l1_bytes = TCP_TOTAL_CACHE_ACCESSES_sum * 64 - - # Arithmetic intensity = FLOP / byte - ai_l1 = fops / l1_bytes if l1_bytes > 0 else 0.0 - - return ai_l1 diff --git a/metrix/src/metrix/backends/gfx942.py b/metrix/src/metrix/backends/gfx942.py index 3391012..19f974c 100644 --- a/metrix/src/metrix/backends/gfx942.py +++ b/metrix/src/metrix/backends/gfx942.py @@ -1,13 +1,14 @@ """ GFX942 (MI300X) Backend -Each metric is defined with @metric decorator. -Counter names appear EXACTLY ONCE - as function parameters. +Metrics are loaded from counter_defs.yaml. +This file provides architecture-specific infrastructure only. +Device specs are queried from rocminfo / rocm-smi at runtime. """ from .base import CounterBackend, DeviceSpecs, ProfileResult +from .device_info import query_device_specs from ..utils.common import split_counters_into_passes -from .decorator import metric from ..profiler.rocprof_wrapper import ROCProfV3Wrapper from typing import List, Optional, Dict @@ -16,28 +17,11 @@ class GFX942Backend(CounterBackend): """ AMD MI300X (gfx942) counter backend - All metrics are defined with @metric decorator. - Hardware counter names appear ONLY as function parameter names. + Metric definitions live in counter_defs.yaml. """ def _get_device_specs(self) -> DeviceSpecs: - """MI300X specifications""" - return DeviceSpecs( - arch="gfx942", - name="AMD Instinct MI300X", - num_cu=304, - max_waves_per_cu=32, - wavefront_size=64, - base_clock_mhz=2100.0, - hbm_bandwidth_gbs=5300.0, - l2_bandwidth_gbs=11000.0, - l2_size_mb=256.0, - lds_size_per_cu_kb=64.0, - fp32_tflops=163.4, - fp64_tflops=81.7, - int8_tops=1307.4, - boost_clock_mhz=2100, - ) + return query_device_specs("gfx942") def _get_counter_groups(self, counters: List[str]) -> List[List[str]]: """ @@ -63,33 +47,18 @@ def _get_counter_block_limits(self) -> Dict[str, int]: These limits define how many performance counters can be simultaneously collected from each hardware block in a single profiling pass. - - Hardware blocks on MI300X: - - SQ (Shader): Instruction counters (VALU, LDS, VMEM, etc.) - - TA (Texture Addresser): Texture address operations - - TD (Texture Data): Texture data fetch operations - - TCP (Texture Cache per Pipe): L1 vector cache - - TCC (Texture Cache Channel): L2 cache and memory controller - - CPC (Command Processor - Compute): Compute command processing - - CPF (Command Processor - Fetch): Command fetch operations - - SPI (Shader Processor Input): Wavefront dispatch and scheduling - - GRBM (Graphics Register Bus Manager): Global GPU activity - - GDS (Global Data Share): Inter-workgroup communication - - Returns: - Dict mapping block_name -> max_counters_per_pass """ return { - "SQ": 8, # Shader - instruction counters - "TA": 2, # Texture Addresser - "TD": 2, # Texture Data - "TCP": 4, # L1 Cache (Texture Cache per Pipe) - "TCC": 4, # L2 Cache / Memory Controller - "CPC": 2, # Command Processor - Compute - "CPF": 2, # Command Processor - Fetch - "SPI": 6, # Shader Processor Input - "GRBM": 2, # Graphics Register Bus Manager - "GDS": 4, # Global Data Share + "SQ": 8, + "TA": 2, + "TD": 2, + "TCP": 4, + "TCC": 4, + "CPC": 2, + "CPF": 2, + "SPI": 6, + "GRBM": 2, + "GDS": 4, } def _run_rocprof( @@ -103,570 +72,3 @@ def _run_rocprof( """Run rocprofv3 and return results (single pass only - base class handles multi-pass)""" wrapper = ROCProfV3Wrapper(timeout_seconds=timeout_seconds) return wrapper.profile(command, counters, kernel_filter=kernel_filter, cwd=cwd) - - # Memory bandwidth metrics - - @metric("memory.hbm_read_bandwidth") - def _hbm_read_bandwidth( - self, TCC_EA0_RDREQ_sum, TCC_EA0_RDREQ_32B_sum, TCC_BUBBLE_sum, GRBM_GUI_ACTIVE - ): - """ - HBM read bandwidth in GB/s - - Formula: (128B_requests * 128 + 64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq) - - Note: TCC_EA0_RDREQ_sum aggregates across all memory controllers on MI300 - TCC_BUBBLE_sum counts 128B read requests - """ - # Calculate bytes with 32B/64B/128B distinction - bytes_read_128B = TCC_BUBBLE_sum * 128 - bytes_read_64B = (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 - bytes_read_32B = TCC_EA0_RDREQ_32B_sum * 32 - bytes_read = bytes_read_128B + bytes_read_64B + bytes_read_32B - - if GRBM_GUI_ACTIVE == 0: - return 0.0 - - time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6) - return (bytes_read / 1e9) / time_seconds if time_seconds > 0 else 0.0 - - @metric("memory.hbm_write_bandwidth") - def _hbm_write_bandwidth(self, TCC_EA0_WRREQ_sum, TCC_EA0_WRREQ_64B_sum, GRBM_GUI_ACTIVE): - """ - HBM write bandwidth in GB/s (with 32B/64B request granularity) - - Formula: (64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq) - - Note: TCC_EA0_WRREQ_sum aggregates across all memory controllers on MI300 - """ - # Calculate bytes with 32B/64B distinction - bytes_written_64B = TCC_EA0_WRREQ_64B_sum * 64 - bytes_written_32B = (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32 - bytes_written = bytes_written_64B + bytes_written_32B - - if GRBM_GUI_ACTIVE == 0: - return 0.0 - - time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6) - return (bytes_written / 1e9) / time_seconds if time_seconds > 0 else 0.0 - - @metric("memory.hbm_bandwidth_utilization") - def _hbm_bandwidth_utilization( - self, - TCC_EA0_RDREQ_sum, - TCC_EA0_RDREQ_32B_sum, - TCC_BUBBLE_sum, - TCC_EA0_WRREQ_sum, - TCC_EA0_WRREQ_64B_sum, - GRBM_GUI_ACTIVE, - ): - """ - HBM bandwidth utilization as percentage of peak - - Formula: (actual_bandwidth / peak_bandwidth) * 100 - - Note: TCC_EA0_* counters aggregate across all memory controllers on MI300 - TCC_BUBBLE_sum counts 128B read requests - """ - # Calculate bytes with 32B/64B/128B distinction - bytes_read = ( - TCC_BUBBLE_sum * 128 - + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 - + TCC_EA0_RDREQ_32B_sum * 32 - ) - bytes_written = ( - TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32 - ) - total_bytes = bytes_read + bytes_written - - if GRBM_GUI_ACTIVE == 0: - return 0.0 - - time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6) - actual_bw_gbs = (total_bytes / 1e9) / time_seconds if time_seconds > 0 else 0.0 - - return (actual_bw_gbs / self.device_specs.hbm_bandwidth_gbs) * 100 - - @metric("memory.bytes_transferred_hbm") - def _bytes_transferred_hbm( - self, - TCC_EA0_RDREQ_sum, - TCC_EA0_RDREQ_32B_sum, - TCC_BUBBLE_sum, - TCC_EA0_WRREQ_sum, - TCC_EA0_WRREQ_64B_sum, - ): - """ - Total bytes transferred through HBM - - Formula: (128B_read_requests * 128 + 64B_read_requests * 64 + 32B_read_requests * 32 + - 64B_write_requests * 64 + 32B_write_requests * 32) - - Note: TCC_EA0_* counters aggregate across all memory controllers on MI300 - TCC_BUBBLE_sum counts 128B read requests - """ - bytes_read = ( - TCC_BUBBLE_sum * 128 - + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 - + TCC_EA0_RDREQ_32B_sum * 32 - ) - bytes_written = ( - TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32 - ) - return bytes_read + bytes_written - - @metric("memory.bytes_transferred_l2") - def _bytes_transferred_l2(self, TCC_REQ_sum): - """ - Total bytes transferred through L2 cache - - Formula: TCC_REQ_sum * 128 (L2 cache line size is 128 bytes) - """ - return TCC_REQ_sum * 128 - - @metric("memory.bytes_transferred_l1") - def _bytes_transferred_l1(self, TCP_TOTAL_CACHE_ACCESSES_sum): - """ - Total bytes transferred through L1 cache - - Formula: TCP_TOTAL_CACHE_ACCESSES_sum * 128 (L1 cache line size is 128 bytes) - """ - return TCP_TOTAL_CACHE_ACCESSES_sum * 128 - - # Cache metrics - - @metric("memory.l2_hit_rate") - def _l2_hit_rate(self, TCC_HIT_sum, TCC_MISS_sum): - """ - L2 cache hit rate as percentage - - Formula: (hits / (hits + misses)) * 100 - """ - total = TCC_HIT_sum + TCC_MISS_sum - return (TCC_HIT_sum / total) * 100 if total > 0 else 0.0 - - @metric("memory.l1_hit_rate") - def _l1_hit_rate(self, TCP_TCC_READ_REQ_sum, TCP_TOTAL_CACHE_ACCESSES_sum): - """ - L1 cache hit rate as percentage - - Formula: ((total_accesses - l1_misses) / total_accesses) * 100 - L1 misses go to L2 (TCC), so misses = TCP_TCC_READ_REQ - """ - if TCP_TOTAL_CACHE_ACCESSES_sum == 0: - return 0.0 - - l1_hits = TCP_TOTAL_CACHE_ACCESSES_sum - TCP_TCC_READ_REQ_sum - return (l1_hits / TCP_TOTAL_CACHE_ACCESSES_sum) * 100 - - @metric("memory.l2_bandwidth") - def _l2_bandwidth(self, TCC_HIT_sum, TCC_MISS_sum, GRBM_GUI_ACTIVE): - """ - L2 cache bandwidth in GB/s - - Formula: (total_accesses * 128 bytes) / time - Note: L2 cacheline is 128 bytes - """ - total_accesses = TCC_HIT_sum + TCC_MISS_sum - bytes_accessed = total_accesses * 128 # L2 cacheline size - - if GRBM_GUI_ACTIVE == 0: - return 0.0 - - time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6) - return (bytes_accessed / 1e9) / time_seconds if time_seconds > 0 else 0.0 - - # Coalescing metrics - - @metric("memory.coalescing_efficiency") - def _coalescing_efficiency(self, SQ_INSTS_VMEM_RD, SQ_INSTS_VMEM_WR, TCP_TOTAL_ACCESSES_sum): - """ - Memory coalescing efficiency as percentage - - Formula: (total_memory_instructions * 16 / total_cache_accesses) * 100 - - Physical meaning: - - Perfect coalescing (stride=1): 100% (minimal cache accesses) - - Poor coalescing (stride>1): 25% for float, 50% for double - - This represents actual bandwidth efficiency, not rescaled. - """ - total_instructions = SQ_INSTS_VMEM_RD + SQ_INSTS_VMEM_WR - - if TCP_TOTAL_ACCESSES_sum == 0: - return 0.0 - - # 16 = 64 threads per wavefront / 4 threads per cacheline - efficiency = (total_instructions * 16 / TCP_TOTAL_ACCESSES_sum) * 100 - - # Cap at 100% (can happen due to prefetching) - return min(efficiency, 100.0) - - @metric("memory.global_load_efficiency") - def _global_load_efficiency(self, SQ_INSTS_VMEM_RD, TCP_TCC_READ_REQ_sum): - """ - Global load efficiency - ratio of requested vs fetched memory - - Formula: (read_instructions * 64 bytes / read_requests * 64 bytes) * 100 - Simplifies to: (read_instructions / read_requests) * 100 - """ - if TCP_TCC_READ_REQ_sum == 0: - return 0.0 - - return min((SQ_INSTS_VMEM_RD / TCP_TCC_READ_REQ_sum) * 100, 100.0) - - @metric("memory.global_store_efficiency") - def _global_store_efficiency(self, SQ_INSTS_VMEM_WR, TCP_TCC_WRITE_REQ_sum): - """ - Global store efficiency - ratio of requested vs written memory - - Formula: (write_instructions / write_requests) * 100 - """ - if TCP_TCC_WRITE_REQ_sum == 0: - return 0.0 - - return min((SQ_INSTS_VMEM_WR / TCP_TCC_WRITE_REQ_sum) * 100, 100.0) - - # LDS metrics - - @metric("memory.lds_bank_conflicts") - def _lds_bank_conflicts(self, SQ_LDS_BANK_CONFLICT, SQ_INSTS_LDS): - """ - LDS bank conflicts per instruction - - Formula: total_conflicts / total_lds_instructions - """ - if SQ_INSTS_LDS == 0: - return 0.0 - - return SQ_LDS_BANK_CONFLICT / SQ_INSTS_LDS - - # Atomic metrics - - @metric("memory.atomic_latency") - def _atomic_latency(self, TCC_EA0_ATOMIC_LEVEL_sum, TCC_EA0_ATOMIC_sum): - """ - Average atomic operation latency in cycles (L2 cache atomic latency) - - Formula: TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum (MI300/MI350 counters) - - Note: This measures atomic operations to/from L2 cache, not GDS operations. - GDS (Global Data Share) is a special feature rarely used by most kernels. - """ - if TCC_EA0_ATOMIC_sum == 0: - return 0.0 - - return TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum - - # Compute metrics - - @metric("compute.total_flops") - def _total_flops( - self, - SQ_INSTS_VALU_ADD_F16, - SQ_INSTS_VALU_MUL_F16, - SQ_INSTS_VALU_TRANS_F16, - SQ_INSTS_VALU_FMA_F16, - SQ_INSTS_VALU_ADD_F32, - SQ_INSTS_VALU_MUL_F32, - SQ_INSTS_VALU_TRANS_F32, - SQ_INSTS_VALU_FMA_F32, - SQ_INSTS_VALU_ADD_F64, - SQ_INSTS_VALU_MUL_F64, - SQ_INSTS_VALU_TRANS_F64, - SQ_INSTS_VALU_FMA_F64, - SQ_INSTS_VALU_MFMA_MOPS_F16, - SQ_INSTS_VALU_MFMA_MOPS_BF16, - SQ_INSTS_VALU_MFMA_MOPS_F32, - SQ_INSTS_VALU_MFMA_MOPS_F64, - ): - """ - Total floating-point operations performed by the kernel - - Formula: 64 * (FP16 + FP32 + FP64) + 512 * MFMA - - 64 operations per wave (wavefront size = 64) - - FMA counts as 2 operations (multiply + add) - - MFMA instructions produce 512 operations per instruction - """ - fops = 64 * ( - ( - SQ_INSTS_VALU_ADD_F16 - + SQ_INSTS_VALU_MUL_F16 - + SQ_INSTS_VALU_TRANS_F16 - + SQ_INSTS_VALU_FMA_F16 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 - + SQ_INSTS_VALU_TRANS_F32 - + SQ_INSTS_VALU_FMA_F32 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64 - + SQ_INSTS_VALU_TRANS_F64 - + SQ_INSTS_VALU_FMA_F64 * 2 - ) - ) + 512 * ( - SQ_INSTS_VALU_MFMA_MOPS_F16 - + SQ_INSTS_VALU_MFMA_MOPS_BF16 - + SQ_INSTS_VALU_MFMA_MOPS_F32 - + SQ_INSTS_VALU_MFMA_MOPS_F64 - ) - - return fops - - @metric("compute.hbm_gflops") - def _hbm_gflops( - self, - SQ_INSTS_VALU_ADD_F16, - SQ_INSTS_VALU_MUL_F16, - SQ_INSTS_VALU_TRANS_F16, - SQ_INSTS_VALU_FMA_F16, - SQ_INSTS_VALU_ADD_F32, - SQ_INSTS_VALU_MUL_F32, - SQ_INSTS_VALU_TRANS_F32, - SQ_INSTS_VALU_FMA_F32, - SQ_INSTS_VALU_ADD_F64, - SQ_INSTS_VALU_MUL_F64, - SQ_INSTS_VALU_TRANS_F64, - SQ_INSTS_VALU_FMA_F64, - SQ_INSTS_VALU_MFMA_MOPS_F16, - SQ_INSTS_VALU_MFMA_MOPS_BF16, - SQ_INSTS_VALU_MFMA_MOPS_F32, - SQ_INSTS_VALU_MFMA_MOPS_F64, - ): - """ - Compute throughput (GFLOPS) using profiler kernel duration. - - Formula: (total_flops / 1e9) / (duration_us / 1e6) - Duration is set by the base class from profiler timestamps before calling. - """ - # Calculate total FLOPS (same as compute.total_flops) - fops = 64 * ( - ( - SQ_INSTS_VALU_ADD_F16 - + SQ_INSTS_VALU_MUL_F16 - + SQ_INSTS_VALU_TRANS_F16 - + SQ_INSTS_VALU_FMA_F16 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 - + SQ_INSTS_VALU_TRANS_F32 - + SQ_INSTS_VALU_FMA_F32 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64 - + SQ_INSTS_VALU_TRANS_F64 - + SQ_INSTS_VALU_FMA_F64 * 2 - ) - ) + 512 * ( - SQ_INSTS_VALU_MFMA_MOPS_F16 - + SQ_INSTS_VALU_MFMA_MOPS_BF16 - + SQ_INSTS_VALU_MFMA_MOPS_F32 - + SQ_INSTS_VALU_MFMA_MOPS_F64 - ) - - duration_us = getattr(self, "_current_duration_us", 0.0) - if duration_us <= 0: - return 0.0 - - time_seconds = duration_us / 1e6 - gflops = (fops / 1e9) / time_seconds - - return gflops - - @metric("compute.hbm_arithmetic_intensity") - def _hbm_arithmetic_intensity( - self, - SQ_INSTS_VALU_ADD_F16, - SQ_INSTS_VALU_MUL_F16, - SQ_INSTS_VALU_TRANS_F16, - SQ_INSTS_VALU_FMA_F16, - SQ_INSTS_VALU_ADD_F32, - SQ_INSTS_VALU_MUL_F32, - SQ_INSTS_VALU_TRANS_F32, - SQ_INSTS_VALU_FMA_F32, - SQ_INSTS_VALU_ADD_F64, - SQ_INSTS_VALU_MUL_F64, - SQ_INSTS_VALU_TRANS_F64, - SQ_INSTS_VALU_FMA_F64, - SQ_INSTS_VALU_MFMA_MOPS_F16, - SQ_INSTS_VALU_MFMA_MOPS_BF16, - SQ_INSTS_VALU_MFMA_MOPS_F32, - SQ_INSTS_VALU_MFMA_MOPS_F64, - TCC_EA0_RDREQ_sum, - TCC_EA0_RDREQ_32B_sum, - TCC_BUBBLE_sum, - TCC_EA0_WRREQ_sum, - TCC_EA0_WRREQ_64B_sum, - ): - """ - HBM Arithmetic Intensity: ratio of floating-point operations to HBM bytes transferred (FLOP/byte) - - Formula: total_flops / hbm_bytes - """ - # Calculate total FLOPS - fops = 64 * ( - ( - SQ_INSTS_VALU_ADD_F16 - + SQ_INSTS_VALU_MUL_F16 - + SQ_INSTS_VALU_TRANS_F16 - + SQ_INSTS_VALU_FMA_F16 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 - + SQ_INSTS_VALU_TRANS_F32 - + SQ_INSTS_VALU_FMA_F32 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64 - + SQ_INSTS_VALU_TRANS_F64 - + SQ_INSTS_VALU_FMA_F64 * 2 - ) - ) + 512 * ( - SQ_INSTS_VALU_MFMA_MOPS_F16 - + SQ_INSTS_VALU_MFMA_MOPS_BF16 - + SQ_INSTS_VALU_MFMA_MOPS_F32 - + SQ_INSTS_VALU_MFMA_MOPS_F64 - ) - - # Calculate HBM bytes (with 32B/64B/128B distinction) - hbm_rd = ( - TCC_BUBBLE_sum * 128 - + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64 - + TCC_EA0_RDREQ_32B_sum * 32 - ) - hbm_wr = TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32 - hbm_bytes = hbm_rd + hbm_wr - - # Arithmetic intensity = FLOP / byte - ai_hbm = fops / hbm_bytes if hbm_bytes > 0 else 0.0 - - return ai_hbm - - @metric("compute.l2_arithmetic_intensity") - def _l2_arithmetic_intensity( - self, - SQ_INSTS_VALU_ADD_F16, - SQ_INSTS_VALU_MUL_F16, - SQ_INSTS_VALU_TRANS_F16, - SQ_INSTS_VALU_FMA_F16, - SQ_INSTS_VALU_ADD_F32, - SQ_INSTS_VALU_MUL_F32, - SQ_INSTS_VALU_TRANS_F32, - SQ_INSTS_VALU_FMA_F32, - SQ_INSTS_VALU_ADD_F64, - SQ_INSTS_VALU_MUL_F64, - SQ_INSTS_VALU_TRANS_F64, - SQ_INSTS_VALU_FMA_F64, - SQ_INSTS_VALU_MFMA_MOPS_F16, - SQ_INSTS_VALU_MFMA_MOPS_BF16, - SQ_INSTS_VALU_MFMA_MOPS_F32, - SQ_INSTS_VALU_MFMA_MOPS_F64, - TCC_REQ_sum, - ): - """ - L2 Arithmetic Intensity: ratio of floating-point operations to L2 cache bytes accessed (FLOP/byte) - - Formula: total_flops / l2_bytes - """ - # Calculate total FLOPS - fops = 64 * ( - ( - SQ_INSTS_VALU_ADD_F16 - + SQ_INSTS_VALU_MUL_F16 - + SQ_INSTS_VALU_TRANS_F16 - + SQ_INSTS_VALU_FMA_F16 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 - + SQ_INSTS_VALU_TRANS_F32 - + SQ_INSTS_VALU_FMA_F32 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64 - + SQ_INSTS_VALU_TRANS_F64 - + SQ_INSTS_VALU_FMA_F64 * 2 - ) - ) + 512 * ( - SQ_INSTS_VALU_MFMA_MOPS_F16 - + SQ_INSTS_VALU_MFMA_MOPS_BF16 - + SQ_INSTS_VALU_MFMA_MOPS_F32 - + SQ_INSTS_VALU_MFMA_MOPS_F64 - ) - - # Calculate L2 bytes (L2 cache line is 128 bytes) - l2_bytes = TCC_REQ_sum * 128 - - # Arithmetic intensity = FLOP / byte - ai_l2 = fops / l2_bytes if l2_bytes > 0 else 0.0 - - return ai_l2 - - @metric("compute.l1_arithmetic_intensity") - def _l1_arithmetic_intensity( - self, - SQ_INSTS_VALU_ADD_F16, - SQ_INSTS_VALU_MUL_F16, - SQ_INSTS_VALU_TRANS_F16, - SQ_INSTS_VALU_FMA_F16, - SQ_INSTS_VALU_ADD_F32, - SQ_INSTS_VALU_MUL_F32, - SQ_INSTS_VALU_TRANS_F32, - SQ_INSTS_VALU_FMA_F32, - SQ_INSTS_VALU_ADD_F64, - SQ_INSTS_VALU_MUL_F64, - SQ_INSTS_VALU_TRANS_F64, - SQ_INSTS_VALU_FMA_F64, - SQ_INSTS_VALU_MFMA_MOPS_F16, - SQ_INSTS_VALU_MFMA_MOPS_BF16, - SQ_INSTS_VALU_MFMA_MOPS_F32, - SQ_INSTS_VALU_MFMA_MOPS_F64, - TCP_TOTAL_CACHE_ACCESSES_sum, - ): - """ - L1 Arithmetic Intensity: ratio of floating-point operations to L1 cache bytes accessed (FLOP/byte) - - Formula: total_flops / l1_bytes - """ - # Calculate total FLOPS - fops = 64 * ( - ( - SQ_INSTS_VALU_ADD_F16 - + SQ_INSTS_VALU_MUL_F16 - + SQ_INSTS_VALU_TRANS_F16 - + SQ_INSTS_VALU_FMA_F16 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 - + SQ_INSTS_VALU_TRANS_F32 - + SQ_INSTS_VALU_FMA_F32 * 2 - ) - + ( - SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64 - + SQ_INSTS_VALU_TRANS_F64 - + SQ_INSTS_VALU_FMA_F64 * 2 - ) - ) + 512 * ( - SQ_INSTS_VALU_MFMA_MOPS_F16 - + SQ_INSTS_VALU_MFMA_MOPS_BF16 - + SQ_INSTS_VALU_MFMA_MOPS_F32 - + SQ_INSTS_VALU_MFMA_MOPS_F64 - ) - - # Calculate L1 bytes (L1 cache line is 128 bytes on gfx942) - l1_bytes = TCP_TOTAL_CACHE_ACCESSES_sum * 128 - - # Arithmetic intensity = FLOP / byte - ai_l1 = fops / l1_bytes if l1_bytes > 0 else 0.0 - - return ai_l1 diff --git a/metrix/tests/unit/backends/test_backend_metrics.py b/metrix/tests/unit/backends/test_backend_metrics.py index 804747b..faf45a9 100644 --- a/metrix/tests/unit/backends/test_backend_metrics.py +++ b/metrix/tests/unit/backends/test_backend_metrics.py @@ -2,7 +2,8 @@ Unit tests for backend metric computations (gfx942 and gfx90a) Tests use MOCK counter data (no hardware counters in test code!) -Tests are parametrized to run on both MI300X (gfx942) and MI200 (gfx90a) +Tests are parametrized to run on both MI300X (gfx942) and MI200 (gfx90a). +All metrics are loaded from counter_defs.yaml. """ import pytest @@ -15,6 +16,11 @@ def backend(request): return get_backend(request.param) +def compute(backend, metric_name): + """Invoke the YAML-loaded metric compute function""" + return backend._metrics[metric_name]["compute"]() + + def get_arch_counter_names(backend, base_names): """ Map counter names based on backend architecture. @@ -24,7 +30,6 @@ def get_arch_counter_names(backend, base_names): arch = backend.device_specs.arch if arch == "gfx942": - # MI300X counter mapping mapping = { "TCC_EA_RDREQ_sum": "TCC_EA0_RDREQ_sum", "TCC_EA_RDREQ_32B_sum": "TCC_EA0_RDREQ_32B_sum", @@ -33,8 +38,7 @@ def get_arch_counter_names(backend, base_names): "TCC_EA_ATOMIC_sum": "TCC_EA0_ATOMIC_sum", "TCC_EA_ATOMIC_LEVEL_sum": "TCC_EA0_ATOMIC_LEVEL_sum", } - else: # gfx90a - # MI200 uses base names as-is + else: mapping = {} result = {} @@ -51,28 +55,28 @@ def test_perfect_hit_rate(self, backend): """100% hit rate""" backend._raw_data = {"TCC_HIT_sum": 1000, "TCC_MISS_sum": 0} - result = backend._l2_hit_rate() + result = compute(backend, "memory.l2_hit_rate") assert result == 100.0 def test_zero_hit_rate(self, backend): """0% hit rate (all misses)""" backend._raw_data = {"TCC_HIT_sum": 0, "TCC_MISS_sum": 1000} - result = backend._l2_hit_rate() + result = compute(backend, "memory.l2_hit_rate") assert result == 0.0 def test_fifty_percent_hit_rate(self, backend): """50% hit rate""" backend._raw_data = {"TCC_HIT_sum": 500, "TCC_MISS_sum": 500} - result = backend._l2_hit_rate() + result = compute(backend, "memory.l2_hit_rate") assert result == 50.0 def test_no_accesses(self, backend): """Handle zero total accesses""" backend._raw_data = {"TCC_HIT_sum": 0, "TCC_MISS_sum": 0} - result = backend._l2_hit_rate() + result = compute(backend, "memory.l2_hit_rate") assert result == 0.0 @@ -84,10 +88,10 @@ def test_perfect_coalescing(self, backend): backend._raw_data = { "SQ_INSTS_VMEM_RD": 100, "SQ_INSTS_VMEM_WR": 0, - "TCP_TOTAL_ACCESSES_sum": 1600, # 100 * 16 + "TCP_TOTAL_ACCESSES_sum": 1600, } - result = backend._coalescing_efficiency() + result = compute(backend, "memory.coalescing_efficiency") assert result == 100.0 def test_poor_coalescing(self, backend): @@ -95,10 +99,10 @@ def test_poor_coalescing(self, backend): backend._raw_data = { "SQ_INSTS_VMEM_RD": 100, "SQ_INSTS_VMEM_WR": 0, - "TCP_TOTAL_ACCESSES_sum": 6400, # 4x more accesses + "TCP_TOTAL_ACCESSES_sum": 6400, } - result = backend._coalescing_efficiency() + result = compute(backend, "memory.coalescing_efficiency") assert result == 25.0 def test_mixed_read_write(self, backend): @@ -106,10 +110,10 @@ def test_mixed_read_write(self, backend): backend._raw_data = { "SQ_INSTS_VMEM_RD": 50, "SQ_INSTS_VMEM_WR": 50, - "TCP_TOTAL_ACCESSES_sum": 1600, # (50 + 50) * 16 + "TCP_TOTAL_ACCESSES_sum": 1600, } - result = backend._coalescing_efficiency() + result = compute(backend, "memory.coalescing_efficiency") assert result == 100.0 def test_no_memory_instructions(self, backend): @@ -120,7 +124,7 @@ def test_no_memory_instructions(self, backend): "TCP_TOTAL_ACCESSES_sum": 1000, } - result = backend._coalescing_efficiency() + result = compute(backend, "memory.coalescing_efficiency") assert result == 0.0 @@ -131,21 +135,21 @@ def test_no_conflicts(self, backend): """Perfect LDS access pattern""" backend._raw_data = {"SQ_LDS_BANK_CONFLICT": 0, "SQ_INSTS_LDS": 1000} - result = backend._lds_bank_conflicts() + result = compute(backend, "memory.lds_bank_conflicts") assert result == 0.0 def test_high_conflicts(self, backend): """2 conflicts per instruction""" backend._raw_data = {"SQ_LDS_BANK_CONFLICT": 2000, "SQ_INSTS_LDS": 1000} - result = backend._lds_bank_conflicts() + result = compute(backend, "memory.lds_bank_conflicts") assert result == 2.0 def test_no_lds_instructions(self, backend): """Handle zero LDS instructions""" backend._raw_data = {"SQ_LDS_BANK_CONFLICT": 100, "SQ_INSTS_LDS": 0} - result = backend._lds_bank_conflicts() + result = compute(backend, "memory.lds_bank_conflicts") assert result == 0.0 @@ -155,19 +159,17 @@ class TestBandwidthMetrics: def test_hbm_read_bandwidth_64b_only(self, backend): """Test read bandwidth with only 64B requests""" arch = backend.device_specs.arch - clock_mhz = backend.device_specs.base_clock_mhz - # Time calculation based on architecture clock speed if arch == "gfx942": - active_cycles = 2100000 # 1 ms at 2.1 GHz + active_cycles = 2100000 counters = { "TCC_EA_RDREQ_sum": 1000, "TCC_EA_RDREQ_32B_sum": 0, - "TCC_BUBBLE_sum": 0, # gfx942 has this counter + "TCC_BUBBLE_sum": 0, "GRBM_GUI_ACTIVE": active_cycles, } - else: # gfx90a - active_cycles = 1700000 # 1 ms at 1.7 GHz + else: + active_cycles = 1700000 counters = { "TCC_EA_RDREQ_sum": 1000, "TCC_EA_RDREQ_32B_sum": 0, @@ -175,8 +177,7 @@ def test_hbm_read_bandwidth_64b_only(self, backend): } backend._raw_data = get_arch_counter_names(backend, counters) - result = backend._hbm_read_bandwidth() - # (1000 * 64 bytes) / 0.001 seconds = 64 MB/s = 0.064 GB/s + result = compute(backend, "memory.hbm_read_bandwidth") assert 0.06 < result < 0.07 def test_hbm_read_bandwidth_mixed_sizes(self, backend): @@ -184,30 +185,25 @@ def test_hbm_read_bandwidth_mixed_sizes(self, backend): arch = backend.device_specs.arch if arch == "gfx942": - # MI300X with 128B bubble requests - active_cycles = 2100000 # 1 ms at 2.1 GHz + active_cycles = 2100000 counters = { "TCC_EA_RDREQ_sum": 1000, "TCC_EA_RDREQ_32B_sum": 200, - "TCC_BUBBLE_sum": 300, # 300 × 128B = 38400 bytes + "TCC_BUBBLE_sum": 300, "GRBM_GUI_ACTIVE": active_cycles, } - # Remaining: 1000 - 200 - 300 = 500 × 64B = 32000 bytes - # Total: 6400 + 38400 + 32000 = 76800 bytes expected_min, expected_max = 0.07, 0.08 - else: # gfx90a - # MI200 without 128B counter (all 64B or 32B) - active_cycles = 1700000 # 1 ms at 1.7 GHz + else: + active_cycles = 1700000 counters = { "TCC_EA_RDREQ_sum": 1000, "TCC_EA_RDREQ_32B_sum": 400, "GRBM_GUI_ACTIVE": active_cycles, } - # 400 × 32B = 12800, 600 × 64B = 38400, Total = 51200 bytes expected_min, expected_max = 0.05, 0.06 backend._raw_data = get_arch_counter_names(backend, counters) - result = backend._hbm_read_bandwidth() + result = compute(backend, "memory.hbm_read_bandwidth") assert expected_min < result < expected_max def test_hbm_write_bandwidth_64b_only(self, backend): @@ -215,9 +211,9 @@ def test_hbm_write_bandwidth_64b_only(self, backend): arch = backend.device_specs.arch if arch == "gfx942": - active_cycles = 2100000 # 1 ms at 2.1 GHz - else: # gfx90a - active_cycles = 1700000 # 1 ms at 1.7 GHz + active_cycles = 2100000 + else: + active_cycles = 1700000 counters = { "TCC_EA_WRREQ_sum": 1000, @@ -226,8 +222,7 @@ def test_hbm_write_bandwidth_64b_only(self, backend): } backend._raw_data = get_arch_counter_names(backend, counters) - result = backend._hbm_write_bandwidth() - # (1000 * 64 bytes) / 0.001 seconds = 64 MB/s = 0.064 GB/s + result = compute(backend, "memory.hbm_write_bandwidth") assert 0.06 < result < 0.07 def test_hbm_write_bandwidth_mixed_sizes(self, backend): @@ -235,62 +230,62 @@ def test_hbm_write_bandwidth_mixed_sizes(self, backend): arch = backend.device_specs.arch if arch == "gfx942": - active_cycles = 2100000 # 1 ms at 2.1 GHz - else: # gfx90a - active_cycles = 1700000 # 1 ms at 1.7 GHz + active_cycles = 2100000 + else: + active_cycles = 1700000 counters = { "TCC_EA_WRREQ_sum": 1000, "TCC_EA_WRREQ_64B_sum": 600, "GRBM_GUI_ACTIVE": active_cycles, } - # 600 × 64B = 38400, 400 × 32B = 12800, Total = 51200 bytes backend._raw_data = get_arch_counter_names(backend, counters) - result = backend._hbm_write_bandwidth() - # 51200 / 1e9 / 0.001 = 0.0512 GB/s + result = compute(backend, "memory.hbm_write_bandwidth") assert 0.05 < result < 0.06 def test_zero_active_cycles(self, backend): """Handle zero active cycles""" counters = {"TCC_EA_RDREQ_sum": 1000, "TCC_EA_RDREQ_32B_sum": 0, "GRBM_GUI_ACTIVE": 0} - # Add TCC_BUBBLE for gfx942 if backend.device_specs.arch == "gfx942": counters["TCC_BUBBLE_sum"] = 0 backend._raw_data = get_arch_counter_names(backend, counters) - result = backend._hbm_read_bandwidth() + result = compute(backend, "memory.hbm_read_bandwidth") assert result == 0.0 class TestAtomicLatency: """Test L2 cache atomic operation latency computation""" - def test_low_latency(self, backend): + @pytest.fixture(params=["gfx942"]) + def atomic_backend(self, request): + """Only gfx942 supports atomic_latency (broken on gfx90a)""" + return get_backend(request.param) + + def test_low_latency(self, atomic_backend): """10 cycles per atomic operation""" counters = {"TCC_EA_ATOMIC_sum": 1000, "TCC_EA_ATOMIC_LEVEL_sum": 10000} - backend._raw_data = get_arch_counter_names(backend, counters) - result = backend._atomic_latency() - # 10000 / 1000 = 10 cycles per atomic + atomic_backend._raw_data = get_arch_counter_names(atomic_backend, counters) + result = compute(atomic_backend, "memory.atomic_latency") assert result == 10.0 - def test_high_latency(self, backend): + def test_high_latency(self, atomic_backend): """1000 cycles per atomic (contention)""" counters = {"TCC_EA_ATOMIC_sum": 100, "TCC_EA_ATOMIC_LEVEL_sum": 100000} - backend._raw_data = get_arch_counter_names(backend, counters) - result = backend._atomic_latency() - # 100000 / 100 = 1000 cycles per atomic + atomic_backend._raw_data = get_arch_counter_names(atomic_backend, counters) + result = compute(atomic_backend, "memory.atomic_latency") assert result == 1000.0 - def test_no_atomics(self, backend): + def test_no_atomics(self, atomic_backend): """Handle zero atomic instructions""" counters = {"TCC_EA_ATOMIC_sum": 0, "TCC_EA_ATOMIC_LEVEL_sum": 5000} - backend._raw_data = get_arch_counter_names(backend, counters) - result = backend._atomic_latency() + atomic_backend._raw_data = get_arch_counter_names(atomic_backend, counters) + result = compute(atomic_backend, "memory.atomic_latency") assert result == 0.0 @@ -298,29 +293,24 @@ class TestMetricDiscovery: """Test backend auto-discovers metrics""" def test_discovers_all_metrics(self, backend): - """Backend should auto-discover all @metric decorated methods""" + """Backend should discover all YAML-defined metrics""" metrics = backend.get_available_metrics() - # Should have all the metrics we defined assert "memory.l2_hit_rate" in metrics assert "memory.coalescing_efficiency" in metrics assert "memory.lds_bank_conflicts" in metrics assert "memory.hbm_read_bandwidth" in metrics - # atomic_latency is architecture-specific if backend.device_specs.arch == "gfx90a": - # On MI200, atomic_latency is unsupported (broken counter) assert "memory.atomic_latency" not in metrics assert "memory.atomic_latency" in backend._unsupported_metrics else: - # On other architectures (gfx942, etc), it's supported assert "memory.atomic_latency" in metrics def test_get_required_counters(self, backend): """Backend should correctly report required counters for a metric""" counters = backend.get_required_counters(["memory.l2_hit_rate"]) - # Should require TCC_HIT_sum and TCC_MISS_sum (counter names appear in function signature) assert "TCC_HIT_sum" in counters assert "TCC_MISS_sum" in counters assert len(counters) == 2 @@ -365,8 +355,7 @@ def test_total_flops_fp32_add(self, backend): backend._raw_data = self._get_zero_flops_counters() backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 100 - result = backend._total_flops() - # 64 threads per wave * 100 instructions = 6400 FLOPS + result = compute(backend, "compute.total_flops") assert result == 6400 def test_total_flops_fma_counts_double(self, backend): @@ -374,8 +363,7 @@ def test_total_flops_fma_counts_double(self, backend): backend._raw_data = self._get_zero_flops_counters() backend._raw_data["SQ_INSTS_VALU_FMA_F32"] = 100 - result = backend._total_flops() - # 64 threads * 100 FMA * 2 ops = 12800 FLOPS + result = compute(backend, "compute.total_flops") assert result == 12800 def test_total_flops_mfma_high_throughput(self, backend): @@ -383,40 +371,39 @@ def test_total_flops_mfma_high_throughput(self, backend): backend._raw_data = self._get_zero_flops_counters() backend._raw_data["SQ_INSTS_VALU_MFMA_MOPS_F32"] = 10 - result = backend._total_flops() - # 512 ops * 10 instructions = 5120 FLOPS + result = compute(backend, "compute.total_flops") assert result == 5120 def test_total_flops_mixed_precision(self, backend): """Test FLOPS with mixed precision operations""" backend._raw_data = self._get_zero_flops_counters() - backend._raw_data["SQ_INSTS_VALU_ADD_F16"] = 100 # 6400 FLOPS - backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 50 # 3200 FLOPS - backend._raw_data["SQ_INSTS_VALU_ADD_F64"] = 25 # 1600 FLOPS + backend._raw_data["SQ_INSTS_VALU_ADD_F16"] = 100 + backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 50 + backend._raw_data["SQ_INSTS_VALU_ADD_F64"] = 25 - result = backend._total_flops() + result = compute(backend, "compute.total_flops") assert result == 6400 + 3200 + 1600 def test_total_flops_zero(self, backend): """Handle zero FLOPS gracefully""" backend._raw_data = self._get_zero_flops_counters() - result = backend._total_flops() + result = compute(backend, "compute.total_flops") assert result == 0 def test_hbm_gflops_zero_time(self, backend): - """Handle zero active cycles""" + """Handle zero duration""" backend._raw_data = self._get_zero_flops_counters() backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000 - backend._raw_data["GRBM_GUI_ACTIVE"] = 0 + backend._current_duration_us = 0.0 - result = backend._hbm_gflops() + result = compute(backend, "compute.hbm_gflops") assert result == 0.0 def test_hbm_arithmetic_intensity(self, backend): """Test HBM arithmetic intensity calculation""" backend._raw_data = self._get_zero_flops_counters() - backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000 # 64000 FLOPS + backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000 counters = { "TCC_EA_RDREQ_sum": 1000, @@ -425,14 +412,12 @@ def test_hbm_arithmetic_intensity(self, backend): "TCC_EA_WRREQ_64B_sum": 0, } - # Add TCC_BUBBLE for gfx942 if backend.device_specs.arch == "gfx942": counters["TCC_BUBBLE_sum"] = 0 backend._raw_data.update(get_arch_counter_names(backend, counters)) - result = backend._hbm_arithmetic_intensity() - # 64000 FLOPS / (1000 * 64 bytes) = 64000 / 64000 = 1.0 FLOP/byte + result = compute(backend, "compute.hbm_arithmetic_intensity") assert result == 1.0 def test_hbm_arithmetic_intensity_zero_bytes(self, backend): @@ -452,17 +437,16 @@ def test_hbm_arithmetic_intensity_zero_bytes(self, backend): backend._raw_data.update(get_arch_counter_names(backend, counters)) - result = backend._hbm_arithmetic_intensity() + result = compute(backend, "compute.hbm_arithmetic_intensity") assert result == 0.0 def test_l2_arithmetic_intensity(self, backend): """Test L2 arithmetic intensity calculation""" backend._raw_data = self._get_zero_flops_counters() - backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000 # 64000 FLOPS - backend._raw_data["TCC_REQ_sum"] = 500 # 500 * 128 = 64000 bytes + backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000 + backend._raw_data["TCC_REQ_sum"] = 500 - result = backend._l2_arithmetic_intensity() - # 64000 FLOPS / 64000 bytes = 1.0 FLOP/byte + result = compute(backend, "compute.l2_arithmetic_intensity") assert result == 1.0 def test_l2_arithmetic_intensity_zero_bytes(self, backend): @@ -471,24 +455,20 @@ def test_l2_arithmetic_intensity_zero_bytes(self, backend): backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000 backend._raw_data["TCC_REQ_sum"] = 0 - result = backend._l2_arithmetic_intensity() + result = compute(backend, "compute.l2_arithmetic_intensity") assert result == 0.0 def test_l1_arithmetic_intensity(self, backend): """Test L1 arithmetic intensity calculation""" backend._raw_data = self._get_zero_flops_counters() - backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000 # 64000 FLOPS + backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000 - # L1 cache line size differs by architecture: - # gfx942 (MI300X): 128 bytes - # gfx90a (MI200): 64 bytes if backend.device_specs.arch == "gfx942": - backend._raw_data["TCP_TOTAL_CACHE_ACCESSES_sum"] = 500 # 500 * 128 = 64000 bytes - else: # gfx90a - backend._raw_data["TCP_TOTAL_CACHE_ACCESSES_sum"] = 1000 # 1000 * 64 = 64000 bytes + backend._raw_data["TCP_TOTAL_CACHE_ACCESSES_sum"] = 500 + else: + backend._raw_data["TCP_TOTAL_CACHE_ACCESSES_sum"] = 1000 - result = backend._l1_arithmetic_intensity() - # 64000 FLOPS / 64000 bytes = 1.0 FLOP/byte + result = compute(backend, "compute.l1_arithmetic_intensity") assert result == 1.0 def test_l1_arithmetic_intensity_zero_bytes(self, backend): @@ -497,14 +477,13 @@ def test_l1_arithmetic_intensity_zero_bytes(self, backend): backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000 backend._raw_data["TCP_TOTAL_CACHE_ACCESSES_sum"] = 0 - result = backend._l1_arithmetic_intensity() + result = compute(backend, "compute.l1_arithmetic_intensity") assert result == 0.0 def test_high_arithmetic_intensity_compute_bound(self, backend): """Test high AI indicates compute-bound kernel""" backend._raw_data = self._get_zero_flops_counters() - # Lots of compute, little memory - backend._raw_data["SQ_INSTS_VALU_MFMA_MOPS_F32"] = 1000 # 512000 FLOPS + backend._raw_data["SQ_INSTS_VALU_MFMA_MOPS_F32"] = 1000 counters = { "TCC_EA_RDREQ_sum": 100, @@ -518,15 +497,13 @@ def test_high_arithmetic_intensity_compute_bound(self, backend): backend._raw_data.update(get_arch_counter_names(backend, counters)) - result = backend._hbm_arithmetic_intensity() - # 512000 / 6400 = 80 FLOP/byte (very compute-bound) + result = compute(backend, "compute.hbm_arithmetic_intensity") assert result == 80.0 def test_low_arithmetic_intensity_memory_bound(self, backend): """Test low AI indicates memory-bound kernel""" backend._raw_data = self._get_zero_flops_counters() - # Little compute, lots of memory - backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 100 # 6400 FLOPS + backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 100 counters = { "TCC_EA_RDREQ_sum": 10000, @@ -540,6 +517,5 @@ def test_low_arithmetic_intensity_memory_bound(self, backend): backend._raw_data.update(get_arch_counter_names(backend, counters)) - result = backend._hbm_arithmetic_intensity() - # 6400 / 640000 = 0.01 FLOP/byte (very memory-bound) + result = compute(backend, "compute.hbm_arithmetic_intensity") assert result == 0.01 diff --git a/metrix/tests/unit/test_error_handling.py b/metrix/tests/unit/test_error_handling.py index 91dec21..36a3bc8 100644 --- a/metrix/tests/unit/test_error_handling.py +++ b/metrix/tests/unit/test_error_handling.py @@ -174,7 +174,7 @@ def test_division_by_zero_handling(self, arch): backend._raw_data = {"TCC_HIT_sum": 0, "TCC_MISS_sum": 0} # Should return 0.0, not raise ZeroDivisionError - result = backend._l2_hit_rate() + result = backend._metrics["memory.l2_hit_rate"]["compute"]() assert result == 0.0 @pytest.mark.parametrize("arch", ["gfx942", "gfx90a"]) @@ -189,5 +189,5 @@ def test_negative_values_handling(self, arch): } # Should not crash - result = backend._l2_hit_rate() + result = backend._metrics["memory.l2_hit_rate"]["compute"]() assert isinstance(result, (int, float))