diff --git a/metrix/src/metrix/backends/base.py b/metrix/src/metrix/backends/base.py
index 65c8cc5..aee27fb 100644
--- a/metrix/src/metrix/backends/base.py
+++ b/metrix/src/metrix/backends/base.py
@@ -26,15 +26,9 @@ class DeviceSpecs:
 
     # Memory specs
     hbm_bandwidth_gbs: float = 0.0
-    l2_bandwidth_gbs: float = 0.0
     l2_size_mb: float = 0.0
     lds_size_per_cu_kb: float = 0.0
 
-    # Compute capabilities
-    fp32_tflops: float = 0.0
-    fp64_tflops: float = 0.0
-    int8_tops: float = 0.0
-
     # Clock speeds
     boost_clock_mhz: int = 0
 
@@ -144,6 +138,7 @@ def _load_yaml_metrics_if_available(self):
 
         # Parse YAML and collect metrics matching this architecture first
         yaml_metrics = {}
+        yaml_unsupported = {}
         counters_section = yaml_data["rocprofiler-sdk"].get("counters", [])
 
         for counter_def in counters_section:
@@ -164,13 +159,16 @@ def _load_yaml_metrics_if_available(self):
             if definition is None:
                 continue
 
+            # Check if this metric is marked unsupported for this architecture
+            unsupported_reason = definition.get("unsupported_reason")
+            if unsupported_reason:
+                yaml_unsupported[counter_name] = unsupported_reason
+                continue
+
             # Register counters: derived, reduce(), and built-in
             if "expression" in definition:
                 expression = definition["expression"]
 
-                # Check if this is a simple reduce() expression
-                import re
-
                 reduce_match = re.match(
                     r"^reduce\([A-Z_0-9]+,\s*(?:sum|max|min)\)$", expression.strip()
                 )
@@ -193,15 +191,31 @@ def _load_yaml_metrics_if_available(self):
                     "compute": lambda cn=counter_name: self._raw_data.get(cn, 0.0),
                 }
 
-        if not yaml_metrics:
+        if not yaml_metrics and not yaml_unsupported:
             return
 
         # YAML metrics found for this arch -- replace @metric-based metrics
         self._metrics.clear()
         self._unsupported_metrics.clear()
         self._metrics.update(yaml_metrics)
+        self._unsupported_metrics.update(yaml_unsupported)
         print(f"✓ Loaded {len(self._metrics)} YAML-based metrics for {arch}")
 
+    @property
+    def _builtin_expression_vars(self) -> set:
+        """Names that YAML expressions may use without being hardware counters.
+
+        Every DeviceSpecs field except ``arch``/``name`` is exposed upper-cased,
+        plus ``DURATION_US``; the set is rebuilt from the dataclass on each access.
+        """
+        import dataclasses
+
+        builtin = {"DURATION_US"}
+        for spec_field in dataclasses.fields(self.device_specs):
+            if spec_field.name.upper() not in ("ARCH", "NAME"):
+                builtin.add(spec_field.name.upper())
+        return builtin
+
     def _extract_counters_from_expression(self, expression: str) -> List[str]:
         """Extract counter names from YAML expression"""
         import re
@@ -212,10 +226,10 @@ def _extract_counters_from_expression(self, expression: str) -> List[str]:
         for match in re.finditer(r"reduce\(([A-Z_0-9]+),\s*(?:sum|max|min)\)", expression):
             counters.add(match.group(1))
 
-        # Extract standalone counter names
+        # Extract standalone counter names (uppercase identifiers)
         for match in re.finditer(r"\b([A-Z][A-Z_0-9]*(?:_sum)?)\b", expression):
             counter_name = match.group(1)
-            if counter_name not in ["CU_NUM"]:
+            if counter_name not in self._builtin_expression_vars:
                 counters.add(counter_name)
 
         return sorted(list(counters))
@@ -225,8 +239,15 @@ def _create_yaml_compute_function(self, expression: str, metric_name: str):
         import re
 
         def compute():
+            import dataclasses
+
             namespace = dict(self._raw_data)
-            namespace["CU_NUM"] = self.device_specs.num_cu
+
+            # Inject all DeviceSpecs fields as UPPER_CASE variables
+            for f in dataclasses.fields(self.device_specs):
+                if f.name not in ("arch", "name"):
+                    namespace[f.name.upper()] = getattr(self.device_specs, f.name)
+            namespace["DURATION_US"] = getattr(self, "_current_duration_us", 0.0)
 
             # Replace reduce(X, op) with X_op
             processed_expr = re.sub(
diff --git a/metrix/src/metrix/backends/counter_defs.yaml b/metrix/src/metrix/backends/counter_defs.yaml
index dd08279..ba9222b 100644
--- a/metrix/src/metrix/backends/counter_defs.yaml
+++ b/metrix/src/metrix/backends/counter_defs.yaml
@@ -1,14 +1,278 @@
-# Public counter definitions for IntelliKit metrix
-# These use counters available via rocprofv3 --list-avail (no proprietary data)
+# Counter definitions for IntelliKit metrix
+# Single source of truth for all architecture metric definitions.
+# Uses counters available via rocprofv3 --list-avail (no proprietary data).
+#
+# Built-in variables available in expressions (from DeviceSpecs):
+#   NUM_CU, MAX_WAVES_PER_CU, WAVEFRONT_SIZE, BASE_CLOCK_MHZ,
+#   HBM_BANDWIDTH_GBS, L2_SIZE_MB, LDS_SIZE_PER_CU_KB,
+#   BOOST_CLOCK_MHZ, DURATION_US
 
 rocprofiler-sdk:
   counters:
-    - name: GPU_UTILIZATION
+
+    # --- GPU utilization ---
+
+    - name: compute.gpu_utilization
       definitions:
         - expression: (GRBM_GUI_ACTIVE / GRBM_COUNT) * 100
           architectures: [gfx1201, gfx1151]
 
-    - name: L2_HIT_RATE
+    # --- Cache hit rates ---
+
+    - name: memory.l2_hit_rate
       definitions:
+        - expression: (TCC_HIT_sum / (TCC_HIT_sum + TCC_MISS_sum)) * 100
+          architectures: [gfx942, gfx90a]
         - expression: (GL2C_HIT_sum / (GL2C_HIT_sum + GL2C_MISS_sum)) * 100
           architectures: [gfx1201, gfx1151]
+
+    - name: memory.l1_hit_rate
+      definitions:
+        - expression: >-
+            ((TCP_TOTAL_CACHE_ACCESSES_sum - TCP_TCC_READ_REQ_sum)
+            / TCP_TOTAL_CACHE_ACCESSES_sum) * 100
+          architectures: [gfx942, gfx90a]
+
+    # --- L2 bandwidth ---
+
+    - name: memory.l2_bandwidth
+      definitions:
+        - expression: >-
+            (((TCC_HIT_sum + TCC_MISS_sum) * 128) / 1e9)
+            / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
+          architectures: [gfx942, gfx90a]
+
+    # --- Bytes transferred ---
+
+    - name: memory.bytes_transferred_l2
+      definitions:
+        - expression: TCC_REQ_sum * 128
+          architectures: [gfx942, gfx90a]
+
+    - name: memory.bytes_transferred_l1
+      definitions:
+        - expression: TCP_TOTAL_CACHE_ACCESSES_sum * 128
+          architectures: [gfx942]
+        - expression: TCP_TOTAL_CACHE_ACCESSES_sum * 64
+          architectures: [gfx90a]
+
+    # --- Coalescing and efficiency ---
+
+    - name: memory.coalescing_efficiency
+      definitions:
+        - expression: >-
+            min(((SQ_INSTS_VMEM_RD + SQ_INSTS_VMEM_WR) * 16
+            / TCP_TOTAL_ACCESSES_sum) * 100, 100.0)
+          architectures: [gfx942, gfx90a]
+
+    - name: memory.global_load_efficiency
+      definitions:
+        - expression: >-
+            min((SQ_INSTS_VMEM_RD / TCP_TCC_READ_REQ_sum) * 100, 100.0)
+          architectures: [gfx942, gfx90a]
+
+    - name: memory.global_store_efficiency
+      definitions:
+        - expression: >-
+            min((SQ_INSTS_VMEM_WR / TCP_TCC_WRITE_REQ_sum) * 100, 100.0)
+          architectures: [gfx942, gfx90a]
+
+    # --- LDS ---
+
+    - name: memory.lds_bank_conflicts
+      definitions:
+        - expression: SQ_LDS_BANK_CONFLICT / SQ_INSTS_LDS
+          architectures: [gfx942, gfx90a]
+
+    # --- HBM read bandwidth ---
+    # gfx942 has TCC_BUBBLE_sum for 128B reads; gfx90a does not
+
+    - name: memory.hbm_read_bandwidth
+      definitions:
+        - expression: >-
+            ((TCC_BUBBLE_sum * 128
+            + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
+            + TCC_EA0_RDREQ_32B_sum * 32) / 1e9)
+            / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
+          architectures: [gfx942]
+        - expression: >-
+            (((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64
+            + TCC_EA_RDREQ_32B_sum * 32) / 1e9)
+            / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
+          architectures: [gfx90a]
+
+    # --- HBM write bandwidth ---
+
+    - name: memory.hbm_write_bandwidth
+      definitions:
+        - expression: >-
+            ((TCC_EA0_WRREQ_64B_sum * 64
+            + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) / 1e9)
+            / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
+          architectures: [gfx942]
+        - expression: >-
+            ((TCC_EA_WRREQ_64B_sum * 64
+            + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32) / 1e9)
+            / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
+          architectures: [gfx90a]
+
+    # --- HBM bandwidth utilization ---
+
+    - name: memory.hbm_bandwidth_utilization
+      definitions:
+        - expression: >-
+            ((TCC_BUBBLE_sum * 128
+            + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
+            + TCC_EA0_RDREQ_32B_sum * 32
+            + TCC_EA0_WRREQ_64B_sum * 64
+            + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) / 1e9)
+            / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
+            / HBM_BANDWIDTH_GBS * 100
+          architectures: [gfx942]
+        - expression: >-
+            (((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64
+            + TCC_EA_RDREQ_32B_sum * 32
+            + TCC_EA_WRREQ_64B_sum * 64
+            + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32) / 1e9)
+            / (GRBM_GUI_ACTIVE / (BASE_CLOCK_MHZ * 1e6))
+            / HBM_BANDWIDTH_GBS * 100
+          architectures: [gfx90a]
+
+    # --- Total HBM bytes ---
+
+    - name: memory.bytes_transferred_hbm
+      definitions:
+        - expression: >-
+            TCC_BUBBLE_sum * 128
+            + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
+            + TCC_EA0_RDREQ_32B_sum * 32
+            + TCC_EA0_WRREQ_64B_sum * 64
+            + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32
+          architectures: [gfx942]
+        - expression: >-
+            (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64
+            + TCC_EA_RDREQ_32B_sum * 32
+            + TCC_EA_WRREQ_64B_sum * 64
+            + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32
+          architectures: [gfx90a]
+
+    # --- Atomic latency ---
+
+    - name: memory.atomic_latency
+      definitions:
+        - expression: TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum
+          architectures: [gfx942]
+        - unsupported_reason: >-
+            TCC_EA_ATOMIC_LEVEL_sum counter is broken on MI200 (gfx90a).
+            This metric only works correctly on MI300X (gfx942) and newer GPUs.
+          architectures: [gfx90a]
+
+    # --- Compute: total FLOPS ---
+
+    - name: compute.total_flops
+      definitions:
+        - expression: >-
+            64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+            + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+            + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+            + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64)
+          architectures: [gfx942, gfx90a]
+
+    # --- Compute: GFLOPS ---
+
+    - name: compute.hbm_gflops
+      definitions:
+        - expression: >-
+            ((64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+            + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+            + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+            + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
+            / 1e9) / (DURATION_US / 1e6)
+            if DURATION_US > 0 else 0.0
+          architectures: [gfx942, gfx90a]
+
+    # --- Compute: HBM arithmetic intensity ---
+
+    - name: compute.hbm_arithmetic_intensity
+      definitions:
+        - expression: >-
+            (64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+            + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+            + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+            + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
+            / (TCC_BUBBLE_sum * 128
+            + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
+            + TCC_EA0_RDREQ_32B_sum * 32
+            + TCC_EA0_WRREQ_64B_sum * 64
+            + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)
+          architectures: [gfx942]
+        - expression: >-
+            (64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+            + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+            + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+            + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
+            / ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64
+            + TCC_EA_RDREQ_32B_sum * 32
+            + TCC_EA_WRREQ_64B_sum * 64
+            + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)
+          architectures: [gfx90a]
+
+    # --- Compute: L2 arithmetic intensity ---
+
+    - name: compute.l2_arithmetic_intensity
+      definitions:
+        - expression: >-
+            (64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+            + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+            + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+            + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
+            / (TCC_REQ_sum * 128)
+          architectures: [gfx942, gfx90a]
+
+    # --- Compute: L1 arithmetic intensity ---
+    # L1 cache line: 128 bytes on gfx942, 64 bytes on gfx90a
+
+    - name: compute.l1_arithmetic_intensity
+      definitions:
+        - expression: >-
+            (64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+            + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+            + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+            + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
+            / (TCP_TOTAL_CACHE_ACCESSES_sum * 128)
+          architectures: [gfx942]
+        - expression: >-
+            (64 * ((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + SQ_INSTS_VALU_TRANS_F16 + SQ_INSTS_VALU_FMA_F16 * 2)
+            + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + SQ_INSTS_VALU_TRANS_F32 + SQ_INSTS_VALU_FMA_F32 * 2)
+            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + SQ_INSTS_VALU_TRANS_F64 + SQ_INSTS_VALU_FMA_F64 * 2))
+            + 512 * (SQ_INSTS_VALU_MFMA_MOPS_F16 + SQ_INSTS_VALU_MFMA_MOPS_BF16
+            + SQ_INSTS_VALU_MFMA_MOPS_F32 + SQ_INSTS_VALU_MFMA_MOPS_F64))
+            / (TCP_TOTAL_CACHE_ACCESSES_sum * 64)
+          architectures: [gfx90a]
diff --git a/metrix/src/metrix/backends/device_info.py b/metrix/src/metrix/backends/device_info.py
new file mode 100644
index 0000000..ddb56ca
--- /dev/null
+++ b/metrix/src/metrix/backends/device_info.py
@@ -0,0 +1,295 @@
+"""
+Dynamic GPU device info from rocminfo / rocm-smi + peak spec lookup table.
+
+Queryable values (num_cu, wavefront_size, etc.) are read from the live
+system so that one backend class works for every SKU in an architecture
+family (e.g. MI210 vs MI250X both use gfx90a).
+
+The theoretical peak HBM bandwidth cannot be read from hardware, so it is
+stored in a small per-architecture table (``_HBM_PEAK_GBS``).
+
+When the requested arch does not match the GPU actually installed (e.g.
+unit tests creating a gfx942 backend on an MI210 machine), static
+fallback specs are used instead.
+"""
+
+from __future__ import annotations
+
+import re
+import subprocess
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict
+
+if TYPE_CHECKING:
+    from .base import DeviceSpecs
+
+
+# ---------------------------------------------------------------------------
+# Static fallback specs, keyed by gfx architecture string.
+#
+# Used when the requested arch differs from the installed GPU (e.g. unit
+# tests) or when rocminfo is unavailable.  Each entry describes a single
+# representative SKU (e.g. MI300X for gfx942, MI210 for gfx90a); there is no
+# per-chip-ID lookup, so other SKUs of the same arch get the same numbers.
+#
+# Peak values that cannot be queried from hardware (HBM bandwidth) are taken
+# from the vendor pages below so they can be verified / updated.
+#
+# Sources:
+#   HW specs:     https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
+#   MI300X peaks: https://www.amd.com/en/products/accelerators/instinct/mi300/platform.html
+#   MI210 peaks:  https://www.amd.com/en/products/accelerators/instinct/mi200/mi210.html
+# ---------------------------------------------------------------------------
+def _fallback_specs() -> Dict[str, "DeviceSpecs"]:
+    """Return static DeviceSpecs keyed by arch, used when live queries are unusable.
+
+    gfx1201/gfx1151 carry the name and wave32 setting that the RDNA backends
+    hard-coded before they switched to ``query_device_specs``, so they keep
+    those specs when no matching GPU is installed.  A new dict is built per call.
+    """
+    from .base import DeviceSpecs
+
+    cdna = dict(max_waves_per_cu=32, wavefront_size=64, lds_size_per_cu_kb=64.0)
+    rdna = dict(name="AMD Radeon Graphics (RDNA4)", wavefront_size=32)
+    return {
+        "gfx942": DeviceSpecs(
+            arch="gfx942",
+            name="AMD Instinct MI300X",
+            num_cu=304,
+            base_clock_mhz=2100.0, boost_clock_mhz=2100,
+            hbm_bandwidth_gbs=5300.0, l2_size_mb=256.0, **cdna,
+        ),
+        "gfx90a": DeviceSpecs(
+            arch="gfx90a",
+            name="AMD Instinct MI210",
+            num_cu=104,
+            base_clock_mhz=1700.0, boost_clock_mhz=1600,
+            hbm_bandwidth_gbs=1600.0, l2_size_mb=8.0, **cdna,
+        ),
+        "gfx1201": DeviceSpecs(arch="gfx1201", **rdna),
+        "gfx1151": DeviceSpecs(arch="gfx1151", **rdna),
+    }
+
+
+# Peak HBM bandwidth in GB/s per arch (only value not queryable from hardware).
+_HBM_PEAK_GBS: Dict[str, float] = {
+    "gfx942": 5300.0,  # same figure as the MI300X fallback spec
+    "gfx90a": 1600.0,  # same figure as the MI210 fallback spec
+}
+
+
+# ---------------------------------------------------------------------------
+# rocminfo parser — one call, structured results for the first GPU agent
+# ---------------------------------------------------------------------------
+@dataclass
+class RocmInfoGPU:
+    """Parsed GPU agent block from rocminfo."""
+
+    arch: str = ""  # gfx token taken from the agent's "Name:" line
+    marketing_name: str = ""  # value of the agent's "Marketing Name:" line
+    chip_id_hex: str = ""  # lower-cased 0x... value from "Chip ID:"
+    num_cu: int = 0  # from "Compute Unit:"
+    simds_per_cu: int = 0  # from "SIMDs per CU:"
+    max_waves_per_cu: int = 0  # from "Max Waves Per CU:"
+    wavefront_size: int = 64  # from "Wavefront Size:"; stays 64 if the line is absent
+    max_clock_mhz: int = 0  # from the "Max Clock Freq..." line
+    l1_cache_kb: int = 0  # "L1:" entry under "Cache Info:"
+    l2_cache_kb: int = 0  # "L2:" entry under "Cache Info:"
+    lds_size_kb: int = 0  # "Size:" line following the GROUP segment line
+
+
+def _parse_rocminfo() -> RocmInfoGPU:
+    """Run ``rocminfo`` and parse the first GPU agent.
+
+    Raises RuntimeError if rocminfo cannot be launched, times out or exits non-zero.
+    """
+    try:
+        proc = subprocess.run(["rocminfo"], capture_output=True, text=True, timeout=10)
+    except (OSError, subprocess.TimeoutExpired) as exc:
+        # OSError also covers e.g. PermissionError, so callers still get RuntimeError.
+        raise RuntimeError(f"rocminfo unavailable: {exc}") from exc
+    if proc.returncode != 0:
+        raise RuntimeError(f"rocminfo failed (rc={proc.returncode}): {proc.stderr}")
+
+    gpu = RocmInfoGPU()
+    in_gpu_agent = False
+    in_cache = False
+    found_group_segment = False
+    # Names are seen before "Device Type:", so hold them until the agent type is known.
+    pending_agent_name = ""
+    pending_marketing_name = ""
+
+    for line in proc.stdout.splitlines():
+        stripped = line.strip()
+        if stripped.startswith("*******"):
+            if in_gpu_agent:
+                break  # banner of the next agent: the first GPU block is complete
+            pending_agent_name = ""
+            pending_marketing_name = ""
+            continue
+
+        if stripped.startswith("Name:") and not in_gpu_agent:
+            pending_agent_name = stripped.split(":", 1)[1].strip()
+            continue
+        if stripped.startswith("Marketing Name:") and not in_gpu_agent:
+            pending_marketing_name = stripped.split(":", 1)[1].strip()
+            continue
+
+        if "Device Type:" in stripped and "GPU" in stripped:
+            in_gpu_agent = True
+            in_cache = False
+            m = re.search(r"(gfx\w+)", pending_agent_name)
+            if m:
+                gpu.arch = m.group(1)
+            gpu.marketing_name = pending_marketing_name
+            continue
+
+        if not in_gpu_agent:
+            continue  # still inside a non-GPU agent; ignore its attributes
+        if "Device Type:" in stripped and "GPU" not in stripped:
+            break
+
+        if "Cache Info:" in stripped:
+            in_cache = True
+            continue
+        if "Pool Info:" in stripped or "ISA Info:" in stripped:
+            in_cache = False
+
+        if in_cache:
+            m = re.match(r"L1:\s+(\d+)", stripped)
+            if m:
+                gpu.l1_cache_kb = int(m.group(1))
+            m = re.match(r"L2:\s+(\d+)", stripped)
+            if m:
+                gpu.l2_cache_kb = int(m.group(1))
+
+        if stripped.startswith("Chip ID:"):
+            m = re.search(r"\((0x[0-9a-fA-F]+)\)", stripped)
+            if m:
+                gpu.chip_id_hex = m.group(1).lower()
+        elif stripped.startswith("Compute Unit:"):
+            m = re.search(r"(\d+)", stripped.split(":")[1])
+            if m:
+                gpu.num_cu = int(m.group(1))
+        elif stripped.startswith("SIMDs per CU:"):
+            m = re.search(r"(\d+)", stripped.split(":")[1])
+            if m:
+                gpu.simds_per_cu = int(m.group(1))
+        elif stripped.startswith("Max Waves Per CU:"):
+            m = re.search(r"(\d+)", stripped.split(":")[1])
+            if m:
+                gpu.max_waves_per_cu = int(m.group(1))
+        elif stripped.startswith("Wavefront Size:"):
+            m = re.search(r"(\d+)", stripped.split(":")[1])
+            if m:
+                gpu.wavefront_size = int(m.group(1))
+        elif stripped.startswith("Max Clock Freq"):
+            m = re.search(r"(\d+)", stripped.split(":")[1])
+            if m:
+                gpu.max_clock_mhz = int(m.group(1))
+        elif "Segment:" in stripped and "GROUP" in stripped:
+            found_group_segment = True  # the next "Size:" line belongs to this segment
+        elif found_group_segment and stripped.startswith("Size:"):
+            m = re.search(r"(\d+)", stripped.split(":")[1])
+            if m:
+                gpu.lds_size_kb = int(m.group(1))
+            found_group_segment = False
+
+    return gpu
+
+
+def _parse_rocm_smi_json() -> dict:
+    """Run ``rocm-smi --json`` with relevant flags and return first card.
+
+    Best effort: returns {} if rocm-smi cannot be run, fails, or prints unusable JSON.
+    """
+    import json
+
+    try:
+        proc = subprocess.run(
+            [
+                "rocm-smi",
+                "--showproductname",
+                "--showsclkrange",
+                "--showmclkrange",
+                "--showmeminfo",
+                "all",
+                "--json",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+    except (OSError, subprocess.TimeoutExpired):
+        return {}  # OSError covers a missing or non-executable rocm-smi
+    if proc.returncode != 0:
+        return {}
+    try:
+        data = json.loads(proc.stdout)
+    except json.JSONDecodeError:
+        return {}
+    if not isinstance(data, dict):  # e.g. a JSON list or null has no card keys
+        return {}
+    cards = sorted(key for key in data if key.startswith("card"))
+    return data[cards[0]] if cards else {}
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def query_device_specs(arch: str) -> "DeviceSpecs":
+    """
+    Return the DeviceSpecs for *arch*, preferring values read from the live GPU.
+
+    Live values come from rocminfo (plus rocm-smi for a clock range) and are
+    used only when the first GPU agent reported by rocminfo has the requested
+    arch.  Otherwise the static fallback for *arch* is returned, or, if there
+    is no fallback entry, a DeviceSpecs with only arch, name and HBM bandwidth set.
+
+    Args:
+        arch: GFX architecture string (e.g. "gfx90a", "gfx942")
+
+    Returns:
+        DeviceSpecs built from live or fallback data
+    """
+    from .base import DeviceSpecs
+
+    # A rocminfo failure is not fatal; it only selects the fallback path.
+    try:
+        live = _parse_rocminfo()
+    except RuntimeError:
+        live = None
+
+    if live is None or (live.arch or None) != arch:
+        # Arch mismatch or rocminfo unavailable: static table first, bare specs last.
+        static_specs = _fallback_specs()
+        if arch in static_specs:
+            return static_specs[arch]
+        return DeviceSpecs(
+            arch=arch,
+            name=f"AMD GPU ({arch})",
+            hbm_bandwidth_gbs=_HBM_PEAK_GBS.get(arch, 0.0),
+        )
+
+    # The trailing "<n>Mhz" of the range string is taken as the boost clock.
+    # NOTE(review): "mclk" is rocm-smi's memory clock range; confirm it is the
+    # intended source for boost_clock_mhz rather than "Valid sclk range".
+    card = _parse_rocm_smi_json()
+    range_match = re.search(r"(\d+)\s*Mhz\s*$", card.get("Valid mclk range", ""))
+    smi_clock_mhz = int(range_match.group(1)) if range_match else 0
+
+    return DeviceSpecs(
+        arch=arch,
+        name=live.marketing_name or f"AMD GPU ({arch})",
+        num_cu=live.num_cu,
+        max_waves_per_cu=live.max_waves_per_cu,
+        wavefront_size=live.wavefront_size,
+        # rocminfo's max clock is used for the base clock as well.
+        base_clock_mhz=float(live.max_clock_mhz),
+        hbm_bandwidth_gbs=_HBM_PEAK_GBS.get(arch, 0.0),
+        l2_size_mb=live.l2_cache_kb / 1024.0,
+        # 64 KB is assumed when no GROUP segment size was parsed.
+        lds_size_per_cu_kb=float(live.lds_size_kb or 64),
+        # Fall back to rocminfo's max clock when rocm-smi gave no range.
+        boost_clock_mhz=smi_clock_mhz or live.max_clock_mhz,
+    )
diff --git a/metrix/src/metrix/backends/gfx1151.py b/metrix/src/metrix/backends/gfx1151.py
index 048c12d..f4e80e3 100644
--- a/metrix/src/metrix/backends/gfx1151.py
+++ b/metrix/src/metrix/backends/gfx1151.py
@@ -4,16 +4,12 @@
 Shares all hardware configuration with gfx1201 (RDNA4).
 """
 
-from .base import DeviceSpecs
+from .device_info import query_device_specs
 from .gfx1201 import GFX1201Backend
 
 
 class GFX1151Backend(GFX1201Backend):
     """AMD RDNA4 (GFX1151) backend - same hardware config as gfx1201."""
 
-    def _get_device_specs(self) -> DeviceSpecs:
-        return DeviceSpecs(
-            arch="gfx1151",
-            name="AMD Radeon Graphics (RDNA4)",
-            wavefront_size=32,
-        )
+    def _get_device_specs(self):
+        return query_device_specs("gfx1151")
diff --git a/metrix/src/metrix/backends/gfx1201.py b/metrix/src/metrix/backends/gfx1201.py
index ee477af..7039de2 100644
--- a/metrix/src/metrix/backends/gfx1201.py
+++ b/metrix/src/metrix/backends/gfx1201.py
@@ -2,9 +2,12 @@
 GFX1201 (RDNA4) Backend
 
 Metrics are loaded from counter_defs.yaml.
+This file provides architecture-specific infrastructure only.
+Device specs are queried from rocminfo / rocm-smi at runtime.
 """
 
-from .base import CounterBackend, DeviceSpecs, ProfileResult, Statistics
+from .base import CounterBackend, ProfileResult
+from .device_info import query_device_specs
 from ..profiler.rocprof_wrapper import ROCProfV3Wrapper
 from pathlib import Path
 from typing import List, Optional
@@ -13,42 +16,8 @@
 class GFX1201Backend(CounterBackend):
     """AMD RDNA4 (gfx1201) backend."""
 
-    def get_metric_counters(self, metric: str) -> List[str]:
-        if metric not in self._metrics:
-            return [metric]
-        return list(self._metrics[metric]["counters"])
-
-    def get_required_counters(self, metrics: List[str]) -> List[str]:
-        counters = set()
-        skip = {"duration_us"}
-        for metric in metrics:
-            if metric not in self._metrics:
-                counters.add(metric)
-            else:
-                counters.update(c for c in self._metrics[metric]["counters"] if c not in skip)
-        return list(counters)
-
-    def compute_metric_stats(self, dispatch_key: str, metric: str) -> Statistics:
-        if dispatch_key not in self._aggregated:
-            raise KeyError(f"Unknown dispatch key: {dispatch_key}")
-        counter_stats = self._aggregated[dispatch_key]
-        if metric not in self._metrics:
-            if metric in counter_stats:
-                return counter_stats[metric]
-            return Statistics(min=0.0, max=0.0, avg=0.0, count=0)
-        metric_min = self._compute_with_stat_type(metric, counter_stats, "min")
-        metric_max = self._compute_with_stat_type(metric, counter_stats, "max")
-        metric_avg = self._compute_with_stat_type(metric, counter_stats, "avg")
-        first_counter = list(counter_stats.keys())[0]
-        count = counter_stats[first_counter].count
-        return Statistics(min=metric_min, max=metric_max, avg=metric_avg, count=count)
-
-    def _get_device_specs(self) -> DeviceSpecs:
-        return DeviceSpecs(
-            arch="gfx1201",
-            name="AMD Radeon Graphics (RDNA4)",
-            wavefront_size=32,
-        )
+    def _get_device_specs(self):
+        return query_device_specs("gfx1201")
 
     def _run_rocprof(
         self,
diff --git a/metrix/src/metrix/backends/gfx90a.py b/metrix/src/metrix/backends/gfx90a.py
index bc38992..d254155 100644
--- a/metrix/src/metrix/backends/gfx90a.py
+++ b/metrix/src/metrix/backends/gfx90a.py
@@ -1,13 +1,14 @@
 """
 GFX90a (MI200) Backend
 
-Each metric is defined with @metric decorator.
-Counter names appear EXACTLY ONCE - as function parameters.
+Metrics are loaded from counter_defs.yaml.
+This file provides architecture-specific infrastructure only.
+Device specs are queried from rocminfo / rocm-smi at runtime.
 """
 
 from .base import CounterBackend, DeviceSpecs, ProfileResult
+from .device_info import query_device_specs
 from ..utils.common import split_counters_into_passes
-from .decorator import metric
 from ..profiler.rocprof_wrapper import ROCProfV3Wrapper
 from typing import List, Optional, Dict
 
@@ -16,28 +17,11 @@ class GFX90aBackend(CounterBackend):
     """
     AMD MI200 (gfx90a) counter backend
 
-    All metrics are defined with @metric decorator.
-    Hardware counter names appear ONLY as function parameter names.
+    Metric definitions live in counter_defs.yaml.
     """
 
     def _get_device_specs(self) -> DeviceSpecs:
-        """MI200 specifications"""
-        return DeviceSpecs(
-            arch="gfx90a",
-            name="AMD Instinct MI200",
-            num_cu=104,
-            max_waves_per_cu=32,
-            wavefront_size=64,
-            base_clock_mhz=1700.0,
-            hbm_bandwidth_gbs=3200.0,
-            l2_bandwidth_gbs=11000.0,
-            l2_size_mb=16.0,
-            lds_size_per_cu_kb=64.0,
-            fp32_tflops=47.9,
-            fp64_tflops=47.9,
-            int8_tops=383,
-            boost_clock_mhz=1600,
-        )
+        return query_device_specs("gfx90a")
 
     def _get_counter_groups(self, counters: List[str]) -> List[List[str]]:
         """
@@ -63,33 +47,18 @@ def _get_counter_block_limits(self) -> Dict[str, int]:
 
         These limits define how many performance counters can be simultaneously
         collected from each hardware block in a single profiling pass.
-
-        Hardware blocks on MI200:
-        - SQ (Shader): Instruction counters (VALU, LDS, VMEM, etc.)
-        - TA (Texture Addresser): Texture address operations
-        - TD (Texture Data): Texture data fetch operations
-        - TCP (Texture Cache per Pipe): L1 vector cache
-        - TCC (Texture Cache Channel): L2 cache and memory controller
-        - CPC (Command Processor - Compute): Compute command processing
-        - CPF (Command Processor - Fetch): Command fetch operations
-        - SPI (Shader Processor Input): Wavefront dispatch and scheduling
-        - GRBM (Graphics Register Bus Manager): Global GPU activity
-        - GDS (Global Data Share): Inter-workgroup communication
-
-        Returns:
-            Dict mapping block_name -> max_counters_per_pass
         """
         return {
-            "SQ": 8,  # Shader - instruction counters
-            "TA": 2,  # Texture Addresser
-            "TD": 2,  # Texture Data
-            "TCP": 4,  # L1 Cache (Texture Cache per Pipe)
-            "TCC": 4,  # L2 Cache / Memory Controller
-            "CPC": 2,  # Command Processor - Compute
-            "CPF": 2,  # Command Processor - Fetch
-            "SPI": 6,  # Shader Processor Input
-            "GRBM": 2,  # Graphics Register Bus Manager
-            "GDS": 4,  # Global Data Share
+            "SQ": 8,  # Shader: instruction counters (VALU, LDS, VMEM, etc.)
+            "TA": 2,  # Texture Addresser: texture address operations
+            "TD": 2,  # Texture Data: texture data fetch operations
+            "TCP": 4,  # Texture Cache per Pipe: L1 vector cache
+            "TCC": 4,  # Texture Cache Channel: L2 cache and memory controller
+            "CPC": 2,  # Command Processor - Compute: compute command processing
+            "CPF": 2,  # Command Processor - Fetch: command fetch operations
+            "SPI": 6,  # Shader Processor Input: wavefront dispatch and scheduling
+            "GRBM": 2,  # Graphics Register Bus Manager: global GPU activity
+            "GDS": 4,  # Global Data Share: inter-workgroup communication
         }
 
     def _run_rocprof(
@@ -103,537 +72,3 @@ def _run_rocprof(
         """Run rocprofv3 and return results (single pass only - base class handles multi-pass)"""
         wrapper = ROCProfV3Wrapper(timeout_seconds=timeout_seconds)
         return wrapper.profile(command, counters, kernel_filter=kernel_filter, cwd=cwd)
-
-    # Memory bandwidth metrics
-
-    @metric("memory.hbm_read_bandwidth")
-    def _hbm_read_bandwidth(self, TCC_EA_RDREQ_sum, TCC_EA_RDREQ_32B_sum, GRBM_GUI_ACTIVE):
-        """
-        HBM read bandwidth in GB/s
-
-        Formula: (64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq)
-        """
-        # Calculate bytes with 32B/64B distinction
-        bytes_read_64B = (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64
-        bytes_read_32B = TCC_EA_RDREQ_32B_sum * 32
-        bytes_read = bytes_read_64B + bytes_read_32B
-
-        if GRBM_GUI_ACTIVE == 0:
-            return 0.0
-
-        time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6)
-        return (bytes_read / 1e9) / time_seconds if time_seconds > 0 else 0.0
-
-    @metric("memory.hbm_write_bandwidth")
-    def _hbm_write_bandwidth(self, TCC_EA_WRREQ_sum, TCC_EA_WRREQ_64B_sum, GRBM_GUI_ACTIVE):
-        """
-        HBM write bandwidth in GB/s (with 32B/64B request granularity)
-
-        Formula: (64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq)
-        """
-        # Calculate bytes with 32B/64B distinction
-        bytes_written_64B = TCC_EA_WRREQ_64B_sum * 64
-        bytes_written_32B = (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32
-        bytes_written = bytes_written_64B + bytes_written_32B
-
-        if GRBM_GUI_ACTIVE == 0:
-            return 0.0
-
-        time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6)
-        return (bytes_written / 1e9) / time_seconds if time_seconds > 0 else 0.0
-
-    @metric("memory.hbm_bandwidth_utilization")
-    def _hbm_bandwidth_utilization(
-        self,
-        TCC_EA_RDREQ_sum,
-        TCC_EA_RDREQ_32B_sum,
-        TCC_EA_WRREQ_sum,
-        TCC_EA_WRREQ_64B_sum,
-        GRBM_GUI_ACTIVE,
-    ):
-        """
-        HBM bandwidth utilization as percentage of peak
-
-        Formula: (actual_bandwidth / peak_bandwidth) * 100
-        """
-        # Calculate bytes with 32B/64B distinction
-        bytes_read = (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 + TCC_EA_RDREQ_32B_sum * 32
-        bytes_written = TCC_EA_WRREQ_64B_sum * 64 + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32
-        total_bytes = bytes_read + bytes_written
-
-        if GRBM_GUI_ACTIVE == 0:
-            return 0.0
-
-        time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6)
-        actual_bw_gbs = (total_bytes / 1e9) / time_seconds if time_seconds > 0 else 0.0
-
-        return (actual_bw_gbs / self.device_specs.hbm_bandwidth_gbs) * 100
-
-    @metric("memory.bytes_transferred_hbm")
-    def _bytes_transferred_hbm(
-        self, TCC_EA_RDREQ_sum, TCC_EA_RDREQ_32B_sum, TCC_EA_WRREQ_sum, TCC_EA_WRREQ_64B_sum
-    ):
-        """
-        Total bytes transferred through HBM
-
-        Formula: (64B_read_requests * 64 + 32B_read_requests * 32 +
-                  64B_write_requests * 64 + 32B_write_requests * 32)
-        """
-        bytes_read = (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 + TCC_EA_RDREQ_32B_sum * 32
-        bytes_written = TCC_EA_WRREQ_64B_sum * 64 + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32
-        return bytes_read + bytes_written
-
-    @metric("memory.bytes_transferred_l2")
-    def _bytes_transferred_l2(self, TCC_REQ_sum):
-        """
-        Total bytes transferred through L2 cache
-
-        Formula: TCC_REQ_sum * 128 (L2 cache line size is 128 bytes)
-        """
-        return TCC_REQ_sum * 128
-
-    @metric("memory.bytes_transferred_l1")
-    def _bytes_transferred_l1(self, TCP_TOTAL_CACHE_ACCESSES_sum):
-        """
-        Total bytes transferred through L1 cache
-
-        Formula: TCP_TOTAL_CACHE_ACCESSES_sum * 64 (L1 cache line size is 64 bytes)
-        """
-        return TCP_TOTAL_CACHE_ACCESSES_sum * 64
-
-    # Cache metrics
-
-    @metric("memory.l2_hit_rate")
-    def _l2_hit_rate(self, TCC_HIT_sum, TCC_MISS_sum):
-        """
-        L2 cache hit rate as percentage
-
-        Formula: (hits / (hits + misses)) * 100
-        """
-        total = TCC_HIT_sum + TCC_MISS_sum
-        return (TCC_HIT_sum / total) * 100 if total > 0 else 0.0
-
-    @metric("memory.l1_hit_rate")
-    def _l1_hit_rate(self, TCP_TCC_READ_REQ_sum, TCP_TOTAL_CACHE_ACCESSES_sum):
-        """
-        L1 cache hit rate as percentage
-
-        Formula: ((total_accesses - l1_misses) / total_accesses) * 100
-        L1 misses go to L2 (TCC), so misses = TCP_TCC_READ_REQ
-        """
-        if TCP_TOTAL_CACHE_ACCESSES_sum == 0:
-            return 0.0
-
-        l1_hits = TCP_TOTAL_CACHE_ACCESSES_sum - TCP_TCC_READ_REQ_sum
-        return (l1_hits / TCP_TOTAL_CACHE_ACCESSES_sum) * 100
-
-    @metric("memory.l2_bandwidth")
-    def _l2_bandwidth(self, TCC_HIT_sum, TCC_MISS_sum, GRBM_GUI_ACTIVE):
-        """
-        L2 cache bandwidth in GB/s
-
-        Formula: (total_accesses * 128 bytes) / time
-        Note: L2 cacheline is 128 bytes
-        """
-        total_accesses = TCC_HIT_sum + TCC_MISS_sum
-        bytes_accessed = total_accesses * 128  # L2 cacheline size
-
-        if GRBM_GUI_ACTIVE == 0:
-            return 0.0
-
-        time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6)
-        return (bytes_accessed / 1e9) / time_seconds if time_seconds > 0 else 0.0
-
-    # Coalescing metrics
-
-    @metric("memory.coalescing_efficiency")
-    def _coalescing_efficiency(self, SQ_INSTS_VMEM_RD, SQ_INSTS_VMEM_WR, TCP_TOTAL_ACCESSES_sum):
-        """
-        Memory coalescing efficiency as percentage
-
-        Formula: (total_memory_instructions * 16 / total_cache_accesses) * 100
-
-        Physical meaning:
-        - Perfect coalescing (stride=1): 100% (minimal cache accesses)
-        - Poor coalescing (stride>1): 25% for float, 50% for double
-
-        This represents actual bandwidth efficiency, not rescaled.
-        """
-        total_instructions = SQ_INSTS_VMEM_RD + SQ_INSTS_VMEM_WR
-
-        if TCP_TOTAL_ACCESSES_sum == 0:
-            return 0.0
-
-        # 16 = 64 threads per wavefront / 4 threads per cacheline
-        efficiency = (total_instructions * 16 / TCP_TOTAL_ACCESSES_sum) * 100
-
-        # Cap at 100% (can happen due to prefetching)
-        return min(efficiency, 100.0)
-
-    @metric("memory.global_load_efficiency")
-    def _global_load_efficiency(self, SQ_INSTS_VMEM_RD, TCP_TCC_READ_REQ_sum):
-        """
-        Global load efficiency - ratio of requested vs fetched memory
-
-        Formula: (read_instructions * 64 bytes / read_requests * 64 bytes) * 100
-        Simplifies to: (read_instructions / read_requests) * 100
-        """
-        if TCP_TCC_READ_REQ_sum == 0:
-            return 0.0
-
-        return min((SQ_INSTS_VMEM_RD / TCP_TCC_READ_REQ_sum) * 100, 100.0)
-
-    @metric("memory.global_store_efficiency")
-    def _global_store_efficiency(self, SQ_INSTS_VMEM_WR, TCP_TCC_WRITE_REQ_sum):
-        """
-        Global store efficiency - ratio of requested vs written memory
-
-        Formula: (write_instructions / write_requests) * 100
-        """
-        if TCP_TCC_WRITE_REQ_sum == 0:
-            return 0.0
-
-        return min((SQ_INSTS_VMEM_WR / TCP_TCC_WRITE_REQ_sum) * 100, 100.0)
-
-    # LDS metrics
-
-    @metric("memory.lds_bank_conflicts")
-    def _lds_bank_conflicts(self, SQ_LDS_BANK_CONFLICT, SQ_INSTS_LDS):
-        """
-        LDS bank conflicts per instruction
-
-        Formula: total_conflicts / total_lds_instructions
-        """
-        if SQ_INSTS_LDS == 0:
-            return 0.0
-
-        return SQ_LDS_BANK_CONFLICT / SQ_INSTS_LDS
-
-    # Atomic metrics
-
-    @metric(
-        "memory.atomic_latency",
-        unsupported_reason="TCC_EA_ATOMIC_LEVEL_sum counter is broken on MI200 (gfx90a). "
-        "This metric only works correctly on MI300X (gfx942) and newer GPUs.",
-    )
-    def _atomic_latency(self, TCC_EA_ATOMIC_LEVEL_sum, TCC_EA_ATOMIC_sum):
-        """
-        Average atomic operation latency in cycles (L2 cache atomic latency)
-
-        Formula: TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum (MI200 counters)
-
-        Note: This measures atomic operations to/from L2 cache, not GDS operations.
-        GDS (Global Data Share) is a special feature rarely used by most kernels.
-        """
-        if TCC_EA_ATOMIC_sum == 0:
-            return 0.0
-
-        return TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum
-
-    # Compute metrics
-
-    @metric("compute.total_flops")
-    def _total_flops(
-        self,
-        SQ_INSTS_VALU_ADD_F16,
-        SQ_INSTS_VALU_MUL_F16,
-        SQ_INSTS_VALU_TRANS_F16,
-        SQ_INSTS_VALU_FMA_F16,
-        SQ_INSTS_VALU_ADD_F32,
-        SQ_INSTS_VALU_MUL_F32,
-        SQ_INSTS_VALU_TRANS_F32,
-        SQ_INSTS_VALU_FMA_F32,
-        SQ_INSTS_VALU_ADD_F64,
-        SQ_INSTS_VALU_MUL_F64,
-        SQ_INSTS_VALU_TRANS_F64,
-        SQ_INSTS_VALU_FMA_F64,
-        SQ_INSTS_VALU_MFMA_MOPS_F16,
-        SQ_INSTS_VALU_MFMA_MOPS_BF16,
-        SQ_INSTS_VALU_MFMA_MOPS_F32,
-        SQ_INSTS_VALU_MFMA_MOPS_F64,
-    ):
-        """
-        Total floating-point operations performed by the kernel
-
-        Formula: 64 * (FP16 + FP32 + FP64) + 512 * MFMA
-        - 64 operations per wave (wavefront size = 64)
-        - FMA counts as 2 operations (multiply + add)
-        - MFMA instructions produce 512 operations per instruction
-        """
-        fops = 64 * (
-            (
-                SQ_INSTS_VALU_ADD_F16
-                + SQ_INSTS_VALU_MUL_F16
-                + SQ_INSTS_VALU_TRANS_F16
-                + SQ_INSTS_VALU_FMA_F16 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F32
-                + SQ_INSTS_VALU_MUL_F32
-                + SQ_INSTS_VALU_TRANS_F32
-                + SQ_INSTS_VALU_FMA_F32 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F64
-                + SQ_INSTS_VALU_MUL_F64
-                + SQ_INSTS_VALU_TRANS_F64
-                + SQ_INSTS_VALU_FMA_F64 * 2
-            )
-        ) + 512 * (
-            SQ_INSTS_VALU_MFMA_MOPS_F16
-            + SQ_INSTS_VALU_MFMA_MOPS_BF16
-            + SQ_INSTS_VALU_MFMA_MOPS_F32
-            + SQ_INSTS_VALU_MFMA_MOPS_F64
-        )
-
-        return fops
-
-    @metric("compute.hbm_gflops")
-    def _hbm_gflops(
-        self,
-        SQ_INSTS_VALU_ADD_F16,
-        SQ_INSTS_VALU_MUL_F16,
-        SQ_INSTS_VALU_TRANS_F16,
-        SQ_INSTS_VALU_FMA_F16,
-        SQ_INSTS_VALU_ADD_F32,
-        SQ_INSTS_VALU_MUL_F32,
-        SQ_INSTS_VALU_TRANS_F32,
-        SQ_INSTS_VALU_FMA_F32,
-        SQ_INSTS_VALU_ADD_F64,
-        SQ_INSTS_VALU_MUL_F64,
-        SQ_INSTS_VALU_TRANS_F64,
-        SQ_INSTS_VALU_FMA_F64,
-        SQ_INSTS_VALU_MFMA_MOPS_F16,
-        SQ_INSTS_VALU_MFMA_MOPS_BF16,
-        SQ_INSTS_VALU_MFMA_MOPS_F32,
-        SQ_INSTS_VALU_MFMA_MOPS_F64,
-    ):
-        """
-        Compute throughput (GFLOPS) using profiler kernel duration.
-
-        Formula: (total_flops / 1e9) / (duration_us / 1e6)
-        Duration is set by the base class from profiler timestamps before calling.
-        """
-        # Calculate total FLOPS (same as compute.total_flops)
-        fops = 64 * (
-            (
-                SQ_INSTS_VALU_ADD_F16
-                + SQ_INSTS_VALU_MUL_F16
-                + SQ_INSTS_VALU_TRANS_F16
-                + SQ_INSTS_VALU_FMA_F16 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F32
-                + SQ_INSTS_VALU_MUL_F32
-                + SQ_INSTS_VALU_TRANS_F32
-                + SQ_INSTS_VALU_FMA_F32 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F64
-                + SQ_INSTS_VALU_MUL_F64
-                + SQ_INSTS_VALU_TRANS_F64
-                + SQ_INSTS_VALU_FMA_F64 * 2
-            )
-        ) + 512 * (
-            SQ_INSTS_VALU_MFMA_MOPS_F16
-            + SQ_INSTS_VALU_MFMA_MOPS_BF16
-            + SQ_INSTS_VALU_MFMA_MOPS_F32
-            + SQ_INSTS_VALU_MFMA_MOPS_F64
-        )
-
-        duration_us = getattr(self, "_current_duration_us", 0.0)
-        if duration_us <= 0:
-            return 0.0
-
-        time_seconds = duration_us / 1e6
-        gflops = (fops / 1e9) / time_seconds
-
-        return gflops
-
-    @metric("compute.hbm_arithmetic_intensity")
-    def _hbm_arithmetic_intensity(
-        self,
-        SQ_INSTS_VALU_ADD_F16,
-        SQ_INSTS_VALU_MUL_F16,
-        SQ_INSTS_VALU_TRANS_F16,
-        SQ_INSTS_VALU_FMA_F16,
-        SQ_INSTS_VALU_ADD_F32,
-        SQ_INSTS_VALU_MUL_F32,
-        SQ_INSTS_VALU_TRANS_F32,
-        SQ_INSTS_VALU_FMA_F32,
-        SQ_INSTS_VALU_ADD_F64,
-        SQ_INSTS_VALU_MUL_F64,
-        SQ_INSTS_VALU_TRANS_F64,
-        SQ_INSTS_VALU_FMA_F64,
-        SQ_INSTS_VALU_MFMA_MOPS_F16,
-        SQ_INSTS_VALU_MFMA_MOPS_BF16,
-        SQ_INSTS_VALU_MFMA_MOPS_F32,
-        SQ_INSTS_VALU_MFMA_MOPS_F64,
-        TCC_EA_RDREQ_sum,
-        TCC_EA_RDREQ_32B_sum,
-        TCC_EA_WRREQ_sum,
-        TCC_EA_WRREQ_64B_sum,
-    ):
-        """
-        HBM Arithmetic Intensity: ratio of floating-point operations to HBM bytes transferred (FLOP/byte)
-
-        Formula: total_flops / hbm_bytes
-        """
-        # Calculate total FLOPS
-        fops = 64 * (
-            (
-                SQ_INSTS_VALU_ADD_F16
-                + SQ_INSTS_VALU_MUL_F16
-                + SQ_INSTS_VALU_TRANS_F16
-                + SQ_INSTS_VALU_FMA_F16 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F32
-                + SQ_INSTS_VALU_MUL_F32
-                + SQ_INSTS_VALU_TRANS_F32
-                + SQ_INSTS_VALU_FMA_F32 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F64
-                + SQ_INSTS_VALU_MUL_F64
-                + SQ_INSTS_VALU_TRANS_F64
-                + SQ_INSTS_VALU_FMA_F64 * 2
-            )
-        ) + 512 * (
-            SQ_INSTS_VALU_MFMA_MOPS_F16
-            + SQ_INSTS_VALU_MFMA_MOPS_BF16
-            + SQ_INSTS_VALU_MFMA_MOPS_F32
-            + SQ_INSTS_VALU_MFMA_MOPS_F64
-        )
-
-        # Calculate HBM bytes (with 32B/64B/128B distinction)
-        hbm_rd = (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64 + TCC_EA_RDREQ_32B_sum * 32
-        hbm_wr = TCC_EA_WRREQ_64B_sum * 64 + (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32
-        hbm_bytes = hbm_rd + hbm_wr
-
-        # Arithmetic intensity = FLOP / byte
-        ai_hbm = fops / hbm_bytes if hbm_bytes > 0 else 0.0
-
-        return ai_hbm
-
-    @metric("compute.l2_arithmetic_intensity")
-    def _l2_arithmetic_intensity(
-        self,
-        SQ_INSTS_VALU_ADD_F16,
-        SQ_INSTS_VALU_MUL_F16,
-        SQ_INSTS_VALU_TRANS_F16,
-        SQ_INSTS_VALU_FMA_F16,
-        SQ_INSTS_VALU_ADD_F32,
-        SQ_INSTS_VALU_MUL_F32,
-        SQ_INSTS_VALU_TRANS_F32,
-        SQ_INSTS_VALU_FMA_F32,
-        SQ_INSTS_VALU_ADD_F64,
-        SQ_INSTS_VALU_MUL_F64,
-        SQ_INSTS_VALU_TRANS_F64,
-        SQ_INSTS_VALU_FMA_F64,
-        SQ_INSTS_VALU_MFMA_MOPS_F16,
-        SQ_INSTS_VALU_MFMA_MOPS_BF16,
-        SQ_INSTS_VALU_MFMA_MOPS_F32,
-        SQ_INSTS_VALU_MFMA_MOPS_F64,
-        TCC_REQ_sum,
-    ):
-        """
-        L2 Arithmetic Intensity: ratio of floating-point operations to L2 cache bytes accessed (FLOP/byte)
-
-        Formula: total_flops / l2_bytes
-        """
-        # Calculate total FLOPS
-        fops = 64 * (
-            (
-                SQ_INSTS_VALU_ADD_F16
-                + SQ_INSTS_VALU_MUL_F16
-                + SQ_INSTS_VALU_TRANS_F16
-                + SQ_INSTS_VALU_FMA_F16 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F32
-                + SQ_INSTS_VALU_MUL_F32
-                + SQ_INSTS_VALU_TRANS_F32
-                + SQ_INSTS_VALU_FMA_F32 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F64
-                + SQ_INSTS_VALU_MUL_F64
-                + SQ_INSTS_VALU_TRANS_F64
-                + SQ_INSTS_VALU_FMA_F64 * 2
-            )
-        ) + 512 * (
-            SQ_INSTS_VALU_MFMA_MOPS_F16
-            + SQ_INSTS_VALU_MFMA_MOPS_BF16
-            + SQ_INSTS_VALU_MFMA_MOPS_F32
-            + SQ_INSTS_VALU_MFMA_MOPS_F64
-        )
-
-        # Calculate L2 bytes (L2 cache line is 128 bytes)
-        l2_bytes = TCC_REQ_sum * 128
-
-        # Arithmetic intensity = FLOP / byte
-        ai_l2 = fops / l2_bytes if l2_bytes > 0 else 0.0
-
-        return ai_l2
-
-    @metric("compute.l1_arithmetic_intensity")
-    def _l1_arithmetic_intensity(
-        self,
-        SQ_INSTS_VALU_ADD_F16,
-        SQ_INSTS_VALU_MUL_F16,
-        SQ_INSTS_VALU_TRANS_F16,
-        SQ_INSTS_VALU_FMA_F16,
-        SQ_INSTS_VALU_ADD_F32,
-        SQ_INSTS_VALU_MUL_F32,
-        SQ_INSTS_VALU_TRANS_F32,
-        SQ_INSTS_VALU_FMA_F32,
-        SQ_INSTS_VALU_ADD_F64,
-        SQ_INSTS_VALU_MUL_F64,
-        SQ_INSTS_VALU_TRANS_F64,
-        SQ_INSTS_VALU_FMA_F64,
-        SQ_INSTS_VALU_MFMA_MOPS_F16,
-        SQ_INSTS_VALU_MFMA_MOPS_BF16,
-        SQ_INSTS_VALU_MFMA_MOPS_F32,
-        SQ_INSTS_VALU_MFMA_MOPS_F64,
-        TCP_TOTAL_CACHE_ACCESSES_sum,
-    ):
-        """
-        L1 Arithmetic Intensity: ratio of floating-point operations to L1 cache bytes accessed (FLOP/byte)
-
-        Formula: total_flops / l1_bytes
-        """
-        # Calculate total FLOPS
-        fops = 64 * (
-            (
-                SQ_INSTS_VALU_ADD_F16
-                + SQ_INSTS_VALU_MUL_F16
-                + SQ_INSTS_VALU_TRANS_F16
-                + SQ_INSTS_VALU_FMA_F16 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F32
-                + SQ_INSTS_VALU_MUL_F32
-                + SQ_INSTS_VALU_TRANS_F32
-                + SQ_INSTS_VALU_FMA_F32 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F64
-                + SQ_INSTS_VALU_MUL_F64
-                + SQ_INSTS_VALU_TRANS_F64
-                + SQ_INSTS_VALU_FMA_F64 * 2
-            )
-        ) + 512 * (
-            SQ_INSTS_VALU_MFMA_MOPS_F16
-            + SQ_INSTS_VALU_MFMA_MOPS_BF16
-            + SQ_INSTS_VALU_MFMA_MOPS_F32
-            + SQ_INSTS_VALU_MFMA_MOPS_F64
-        )
-
-        # Calculate L1 bytes (L1 cache line is 64 bytes on gfx90a)
-        l1_bytes = TCP_TOTAL_CACHE_ACCESSES_sum * 64
-
-        # Arithmetic intensity = FLOP / byte
-        ai_l1 = fops / l1_bytes if l1_bytes > 0 else 0.0
-
-        return ai_l1
diff --git a/metrix/src/metrix/backends/gfx942.py b/metrix/src/metrix/backends/gfx942.py
index 3391012..19f974c 100644
--- a/metrix/src/metrix/backends/gfx942.py
+++ b/metrix/src/metrix/backends/gfx942.py
@@ -1,13 +1,14 @@
 """
 GFX942 (MI300X) Backend
 
-Each metric is defined with @metric decorator.
-Counter names appear EXACTLY ONCE - as function parameters.
+Metrics are loaded from counter_defs.yaml.
+This file provides architecture-specific infrastructure only.
+Device specs are queried from rocminfo / rocm-smi at runtime.
 """
 
 from .base import CounterBackend, DeviceSpecs, ProfileResult
+from .device_info import query_device_specs
 from ..utils.common import split_counters_into_passes
-from .decorator import metric
 from ..profiler.rocprof_wrapper import ROCProfV3Wrapper
 from typing import List, Optional, Dict
 
@@ -16,28 +17,11 @@ class GFX942Backend(CounterBackend):
     """
     AMD MI300X (gfx942) counter backend
 
-    All metrics are defined with @metric decorator.
-    Hardware counter names appear ONLY as function parameter names.
+    Metric definitions live in counter_defs.yaml.
     """
 
     def _get_device_specs(self) -> DeviceSpecs:
-        """MI300X specifications"""
-        return DeviceSpecs(
-            arch="gfx942",
-            name="AMD Instinct MI300X",
-            num_cu=304,
-            max_waves_per_cu=32,
-            wavefront_size=64,
-            base_clock_mhz=2100.0,
-            hbm_bandwidth_gbs=5300.0,
-            l2_bandwidth_gbs=11000.0,
-            l2_size_mb=256.0,
-            lds_size_per_cu_kb=64.0,
-            fp32_tflops=163.4,
-            fp64_tflops=81.7,
-            int8_tops=1307.4,
-            boost_clock_mhz=2100,
-        )
+        return query_device_specs("gfx942")  # runtime query via rocminfo / rocm-smi
 
     def _get_counter_groups(self, counters: List[str]) -> List[List[str]]:
         """
@@ -63,33 +47,18 @@ def _get_counter_block_limits(self) -> Dict[str, int]:
 
         These limits define how many performance counters can be simultaneously
         collected from each hardware block in a single profiling pass.
-
-        Hardware blocks on MI300X:
-        - SQ (Shader): Instruction counters (VALU, LDS, VMEM, etc.)
-        - TA (Texture Addresser): Texture address operations
-        - TD (Texture Data): Texture data fetch operations
-        - TCP (Texture Cache per Pipe): L1 vector cache
-        - TCC (Texture Cache Channel): L2 cache and memory controller
-        - CPC (Command Processor - Compute): Compute command processing
-        - CPF (Command Processor - Fetch): Command fetch operations
-        - SPI (Shader Processor Input): Wavefront dispatch and scheduling
-        - GRBM (Graphics Register Bus Manager): Global GPU activity
-        - GDS (Global Data Share): Inter-workgroup communication
-
-        Returns:
-            Dict mapping block_name -> max_counters_per_pass
         """
         return {
-            "SQ": 8,  # Shader - instruction counters
-            "TA": 2,  # Texture Addresser
-            "TD": 2,  # Texture Data
-            "TCP": 4,  # L1 Cache (Texture Cache per Pipe)
-            "TCC": 4,  # L2 Cache / Memory Controller
-            "CPC": 2,  # Command Processor - Compute
-            "CPF": 2,  # Command Processor - Fetch
-            "SPI": 6,  # Shader Processor Input
-            "GRBM": 2,  # Graphics Register Bus Manager
-            "GDS": 4,  # Global Data Share
+            "SQ": 8,  # Shader: instruction counters (VALU, LDS, VMEM, etc.)
+            "TA": 2,  # Texture Addresser: texture address operations
+            "TD": 2,  # Texture Data: texture data fetch operations
+            "TCP": 4,  # Texture Cache per Pipe: L1 vector cache
+            "TCC": 4,  # Texture Cache Channel: L2 cache and memory controller
+            "CPC": 2,  # Command Processor - Compute: compute command processing
+            "CPF": 2,  # Command Processor - Fetch: command fetch operations
+            "SPI": 6,  # Shader Processor Input: wavefront dispatch and scheduling
+            "GRBM": 2,  # Graphics Register Bus Manager: global GPU activity
+            "GDS": 4,  # Global Data Share: inter-workgroup communication
         }
 
     def _run_rocprof(
@@ -103,570 +72,3 @@ def _run_rocprof(
         """Run rocprofv3 and return results (single pass only - base class handles multi-pass)"""
         wrapper = ROCProfV3Wrapper(timeout_seconds=timeout_seconds)
         return wrapper.profile(command, counters, kernel_filter=kernel_filter, cwd=cwd)
-
-    # Memory bandwidth metrics
-
-    @metric("memory.hbm_read_bandwidth")
-    def _hbm_read_bandwidth(
-        self, TCC_EA0_RDREQ_sum, TCC_EA0_RDREQ_32B_sum, TCC_BUBBLE_sum, GRBM_GUI_ACTIVE
-    ):
-        """
-        HBM read bandwidth in GB/s
-
-        Formula: (128B_requests * 128 + 64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq)
-
-        Note: TCC_EA0_RDREQ_sum aggregates across all memory controllers on MI300
-              TCC_BUBBLE_sum counts 128B read requests
-        """
-        # Calculate bytes with 32B/64B/128B distinction
-        bytes_read_128B = TCC_BUBBLE_sum * 128
-        bytes_read_64B = (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
-        bytes_read_32B = TCC_EA0_RDREQ_32B_sum * 32
-        bytes_read = bytes_read_128B + bytes_read_64B + bytes_read_32B
-
-        if GRBM_GUI_ACTIVE == 0:
-            return 0.0
-
-        time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6)
-        return (bytes_read / 1e9) / time_seconds if time_seconds > 0 else 0.0
-
-    @metric("memory.hbm_write_bandwidth")
-    def _hbm_write_bandwidth(self, TCC_EA0_WRREQ_sum, TCC_EA0_WRREQ_64B_sum, GRBM_GUI_ACTIVE):
-        """
-        HBM write bandwidth in GB/s (with 32B/64B request granularity)
-
-        Formula: (64B_requests * 64 + 32B_requests * 32) / (active_cycles / clock_freq)
-
-        Note: TCC_EA0_WRREQ_sum aggregates across all memory controllers on MI300
-        """
-        # Calculate bytes with 32B/64B distinction
-        bytes_written_64B = TCC_EA0_WRREQ_64B_sum * 64
-        bytes_written_32B = (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32
-        bytes_written = bytes_written_64B + bytes_written_32B
-
-        if GRBM_GUI_ACTIVE == 0:
-            return 0.0
-
-        time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6)
-        return (bytes_written / 1e9) / time_seconds if time_seconds > 0 else 0.0
-
-    @metric("memory.hbm_bandwidth_utilization")
-    def _hbm_bandwidth_utilization(
-        self,
-        TCC_EA0_RDREQ_sum,
-        TCC_EA0_RDREQ_32B_sum,
-        TCC_BUBBLE_sum,
-        TCC_EA0_WRREQ_sum,
-        TCC_EA0_WRREQ_64B_sum,
-        GRBM_GUI_ACTIVE,
-    ):
-        """
-        HBM bandwidth utilization as percentage of peak
-
-        Formula: (actual_bandwidth / peak_bandwidth) * 100
-
-        Note: TCC_EA0_* counters aggregate across all memory controllers on MI300
-              TCC_BUBBLE_sum counts 128B read requests
-        """
-        # Calculate bytes with 32B/64B/128B distinction
-        bytes_read = (
-            TCC_BUBBLE_sum * 128
-            + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
-            + TCC_EA0_RDREQ_32B_sum * 32
-        )
-        bytes_written = (
-            TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32
-        )
-        total_bytes = bytes_read + bytes_written
-
-        if GRBM_GUI_ACTIVE == 0:
-            return 0.0
-
-        time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6)
-        actual_bw_gbs = (total_bytes / 1e9) / time_seconds if time_seconds > 0 else 0.0
-
-        return (actual_bw_gbs / self.device_specs.hbm_bandwidth_gbs) * 100
-
-    @metric("memory.bytes_transferred_hbm")
-    def _bytes_transferred_hbm(
-        self,
-        TCC_EA0_RDREQ_sum,
-        TCC_EA0_RDREQ_32B_sum,
-        TCC_BUBBLE_sum,
-        TCC_EA0_WRREQ_sum,
-        TCC_EA0_WRREQ_64B_sum,
-    ):
-        """
-        Total bytes transferred through HBM
-
-        Formula: (128B_read_requests * 128 + 64B_read_requests * 64 + 32B_read_requests * 32 +
-                  64B_write_requests * 64 + 32B_write_requests * 32)
-
-        Note: TCC_EA0_* counters aggregate across all memory controllers on MI300
-              TCC_BUBBLE_sum counts 128B read requests
-        """
-        bytes_read = (
-            TCC_BUBBLE_sum * 128
-            + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
-            + TCC_EA0_RDREQ_32B_sum * 32
-        )
-        bytes_written = (
-            TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32
-        )
-        return bytes_read + bytes_written
-
-    @metric("memory.bytes_transferred_l2")
-    def _bytes_transferred_l2(self, TCC_REQ_sum):
-        """
-        Total bytes transferred through L2 cache
-
-        Formula: TCC_REQ_sum * 128 (L2 cache line size is 128 bytes)
-        """
-        return TCC_REQ_sum * 128
-
-    @metric("memory.bytes_transferred_l1")
-    def _bytes_transferred_l1(self, TCP_TOTAL_CACHE_ACCESSES_sum):
-        """
-        Total bytes transferred through L1 cache
-
-        Formula: TCP_TOTAL_CACHE_ACCESSES_sum * 128 (L1 cache line size is 128 bytes)
-        """
-        return TCP_TOTAL_CACHE_ACCESSES_sum * 128
-
-    # Cache metrics
-
-    @metric("memory.l2_hit_rate")
-    def _l2_hit_rate(self, TCC_HIT_sum, TCC_MISS_sum):
-        """
-        L2 cache hit rate as percentage
-
-        Formula: (hits / (hits + misses)) * 100
-        """
-        total = TCC_HIT_sum + TCC_MISS_sum
-        return (TCC_HIT_sum / total) * 100 if total > 0 else 0.0
-
-    @metric("memory.l1_hit_rate")
-    def _l1_hit_rate(self, TCP_TCC_READ_REQ_sum, TCP_TOTAL_CACHE_ACCESSES_sum):
-        """
-        L1 cache hit rate as percentage
-
-        Formula: ((total_accesses - l1_misses) / total_accesses) * 100
-        L1 misses go to L2 (TCC), so misses = TCP_TCC_READ_REQ
-        """
-        if TCP_TOTAL_CACHE_ACCESSES_sum == 0:
-            return 0.0
-
-        l1_hits = TCP_TOTAL_CACHE_ACCESSES_sum - TCP_TCC_READ_REQ_sum
-        return (l1_hits / TCP_TOTAL_CACHE_ACCESSES_sum) * 100
-
-    @metric("memory.l2_bandwidth")
-    def _l2_bandwidth(self, TCC_HIT_sum, TCC_MISS_sum, GRBM_GUI_ACTIVE):
-        """
-        L2 cache bandwidth in GB/s
-
-        Formula: (total_accesses * 128 bytes) / time
-        Note: L2 cacheline is 128 bytes
-        """
-        total_accesses = TCC_HIT_sum + TCC_MISS_sum
-        bytes_accessed = total_accesses * 128  # L2 cacheline size
-
-        if GRBM_GUI_ACTIVE == 0:
-            return 0.0
-
-        time_seconds = GRBM_GUI_ACTIVE / (self.device_specs.base_clock_mhz * 1e6)
-        return (bytes_accessed / 1e9) / time_seconds if time_seconds > 0 else 0.0
-
-    # Coalescing metrics
-
-    @metric("memory.coalescing_efficiency")
-    def _coalescing_efficiency(self, SQ_INSTS_VMEM_RD, SQ_INSTS_VMEM_WR, TCP_TOTAL_ACCESSES_sum):
-        """
-        Memory coalescing efficiency as percentage
-
-        Formula: (total_memory_instructions * 16 / total_cache_accesses) * 100
-
-        Physical meaning:
-        - Perfect coalescing (stride=1): 100% (minimal cache accesses)
-        - Poor coalescing (stride>1): 25% for float, 50% for double
-
-        This represents actual bandwidth efficiency, not rescaled.
-        """
-        total_instructions = SQ_INSTS_VMEM_RD + SQ_INSTS_VMEM_WR
-
-        if TCP_TOTAL_ACCESSES_sum == 0:
-            return 0.0
-
-        # 16 = 64 threads per wavefront / 4 threads per cacheline
-        efficiency = (total_instructions * 16 / TCP_TOTAL_ACCESSES_sum) * 100
-
-        # Cap at 100% (can happen due to prefetching)
-        return min(efficiency, 100.0)
-
-    @metric("memory.global_load_efficiency")
-    def _global_load_efficiency(self, SQ_INSTS_VMEM_RD, TCP_TCC_READ_REQ_sum):
-        """
-        Global load efficiency - ratio of requested vs fetched memory
-
-        Formula: (read_instructions * 64 bytes / read_requests * 64 bytes) * 100
-        Simplifies to: (read_instructions / read_requests) * 100
-        """
-        if TCP_TCC_READ_REQ_sum == 0:
-            return 0.0
-
-        return min((SQ_INSTS_VMEM_RD / TCP_TCC_READ_REQ_sum) * 100, 100.0)
-
-    @metric("memory.global_store_efficiency")
-    def _global_store_efficiency(self, SQ_INSTS_VMEM_WR, TCP_TCC_WRITE_REQ_sum):
-        """
-        Global store efficiency - ratio of requested vs written memory
-
-        Formula: (write_instructions / write_requests) * 100
-        """
-        if TCP_TCC_WRITE_REQ_sum == 0:
-            return 0.0
-
-        return min((SQ_INSTS_VMEM_WR / TCP_TCC_WRITE_REQ_sum) * 100, 100.0)
-
-    # LDS metrics
-
-    @metric("memory.lds_bank_conflicts")
-    def _lds_bank_conflicts(self, SQ_LDS_BANK_CONFLICT, SQ_INSTS_LDS):
-        """
-        LDS bank conflicts per instruction
-
-        Formula: total_conflicts / total_lds_instructions
-        """
-        if SQ_INSTS_LDS == 0:
-            return 0.0
-
-        return SQ_LDS_BANK_CONFLICT / SQ_INSTS_LDS
-
-    # Atomic metrics
-
-    @metric("memory.atomic_latency")
-    def _atomic_latency(self, TCC_EA0_ATOMIC_LEVEL_sum, TCC_EA0_ATOMIC_sum):
-        """
-        Average atomic operation latency in cycles (L2 cache atomic latency)
-
-        Formula: TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum (MI300/MI350 counters)
-
-        Note: This measures atomic operations to/from L2 cache, not GDS operations.
-        GDS (Global Data Share) is a special feature rarely used by most kernels.
-        """
-        if TCC_EA0_ATOMIC_sum == 0:
-            return 0.0
-
-        return TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum
-
-    # Compute metrics
-
-    @metric("compute.total_flops")
-    def _total_flops(
-        self,
-        SQ_INSTS_VALU_ADD_F16,
-        SQ_INSTS_VALU_MUL_F16,
-        SQ_INSTS_VALU_TRANS_F16,
-        SQ_INSTS_VALU_FMA_F16,
-        SQ_INSTS_VALU_ADD_F32,
-        SQ_INSTS_VALU_MUL_F32,
-        SQ_INSTS_VALU_TRANS_F32,
-        SQ_INSTS_VALU_FMA_F32,
-        SQ_INSTS_VALU_ADD_F64,
-        SQ_INSTS_VALU_MUL_F64,
-        SQ_INSTS_VALU_TRANS_F64,
-        SQ_INSTS_VALU_FMA_F64,
-        SQ_INSTS_VALU_MFMA_MOPS_F16,
-        SQ_INSTS_VALU_MFMA_MOPS_BF16,
-        SQ_INSTS_VALU_MFMA_MOPS_F32,
-        SQ_INSTS_VALU_MFMA_MOPS_F64,
-    ):
-        """
-        Total floating-point operations performed by the kernel
-
-        Formula: 64 * (FP16 + FP32 + FP64) + 512 * MFMA
-        - 64 operations per wave (wavefront size = 64)
-        - FMA counts as 2 operations (multiply + add)
-        - MFMA instructions produce 512 operations per instruction
-        """
-        fops = 64 * (
-            (
-                SQ_INSTS_VALU_ADD_F16
-                + SQ_INSTS_VALU_MUL_F16
-                + SQ_INSTS_VALU_TRANS_F16
-                + SQ_INSTS_VALU_FMA_F16 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F32
-                + SQ_INSTS_VALU_MUL_F32
-                + SQ_INSTS_VALU_TRANS_F32
-                + SQ_INSTS_VALU_FMA_F32 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F64
-                + SQ_INSTS_VALU_MUL_F64
-                + SQ_INSTS_VALU_TRANS_F64
-                + SQ_INSTS_VALU_FMA_F64 * 2
-            )
-        ) + 512 * (
-            SQ_INSTS_VALU_MFMA_MOPS_F16
-            + SQ_INSTS_VALU_MFMA_MOPS_BF16
-            + SQ_INSTS_VALU_MFMA_MOPS_F32
-            + SQ_INSTS_VALU_MFMA_MOPS_F64
-        )
-
-        return fops
-
-    @metric("compute.hbm_gflops")
-    def _hbm_gflops(
-        self,
-        SQ_INSTS_VALU_ADD_F16,
-        SQ_INSTS_VALU_MUL_F16,
-        SQ_INSTS_VALU_TRANS_F16,
-        SQ_INSTS_VALU_FMA_F16,
-        SQ_INSTS_VALU_ADD_F32,
-        SQ_INSTS_VALU_MUL_F32,
-        SQ_INSTS_VALU_TRANS_F32,
-        SQ_INSTS_VALU_FMA_F32,
-        SQ_INSTS_VALU_ADD_F64,
-        SQ_INSTS_VALU_MUL_F64,
-        SQ_INSTS_VALU_TRANS_F64,
-        SQ_INSTS_VALU_FMA_F64,
-        SQ_INSTS_VALU_MFMA_MOPS_F16,
-        SQ_INSTS_VALU_MFMA_MOPS_BF16,
-        SQ_INSTS_VALU_MFMA_MOPS_F32,
-        SQ_INSTS_VALU_MFMA_MOPS_F64,
-    ):
-        """
-        Compute throughput (GFLOPS) using profiler kernel duration.
-
-        Formula: (total_flops / 1e9) / (duration_us / 1e6)
-        Duration is set by the base class from profiler timestamps before calling.
-        """
-        # Calculate total FLOPS (same as compute.total_flops)
-        fops = 64 * (
-            (
-                SQ_INSTS_VALU_ADD_F16
-                + SQ_INSTS_VALU_MUL_F16
-                + SQ_INSTS_VALU_TRANS_F16
-                + SQ_INSTS_VALU_FMA_F16 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F32
-                + SQ_INSTS_VALU_MUL_F32
-                + SQ_INSTS_VALU_TRANS_F32
-                + SQ_INSTS_VALU_FMA_F32 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F64
-                + SQ_INSTS_VALU_MUL_F64
-                + SQ_INSTS_VALU_TRANS_F64
-                + SQ_INSTS_VALU_FMA_F64 * 2
-            )
-        ) + 512 * (
-            SQ_INSTS_VALU_MFMA_MOPS_F16
-            + SQ_INSTS_VALU_MFMA_MOPS_BF16
-            + SQ_INSTS_VALU_MFMA_MOPS_F32
-            + SQ_INSTS_VALU_MFMA_MOPS_F64
-        )
-
-        duration_us = getattr(self, "_current_duration_us", 0.0)
-        if duration_us <= 0:
-            return 0.0
-
-        time_seconds = duration_us / 1e6
-        gflops = (fops / 1e9) / time_seconds
-
-        return gflops
-
-    @metric("compute.hbm_arithmetic_intensity")
-    def _hbm_arithmetic_intensity(
-        self,
-        SQ_INSTS_VALU_ADD_F16,
-        SQ_INSTS_VALU_MUL_F16,
-        SQ_INSTS_VALU_TRANS_F16,
-        SQ_INSTS_VALU_FMA_F16,
-        SQ_INSTS_VALU_ADD_F32,
-        SQ_INSTS_VALU_MUL_F32,
-        SQ_INSTS_VALU_TRANS_F32,
-        SQ_INSTS_VALU_FMA_F32,
-        SQ_INSTS_VALU_ADD_F64,
-        SQ_INSTS_VALU_MUL_F64,
-        SQ_INSTS_VALU_TRANS_F64,
-        SQ_INSTS_VALU_FMA_F64,
-        SQ_INSTS_VALU_MFMA_MOPS_F16,
-        SQ_INSTS_VALU_MFMA_MOPS_BF16,
-        SQ_INSTS_VALU_MFMA_MOPS_F32,
-        SQ_INSTS_VALU_MFMA_MOPS_F64,
-        TCC_EA0_RDREQ_sum,
-        TCC_EA0_RDREQ_32B_sum,
-        TCC_BUBBLE_sum,
-        TCC_EA0_WRREQ_sum,
-        TCC_EA0_WRREQ_64B_sum,
-    ):
-        """
-        HBM Arithmetic Intensity: ratio of floating-point operations to HBM bytes transferred (FLOP/byte)
-
-        Formula: total_flops / hbm_bytes
-        """
-        # Calculate total FLOPS
-        fops = 64 * (
-            (
-                SQ_INSTS_VALU_ADD_F16
-                + SQ_INSTS_VALU_MUL_F16
-                + SQ_INSTS_VALU_TRANS_F16
-                + SQ_INSTS_VALU_FMA_F16 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F32
-                + SQ_INSTS_VALU_MUL_F32
-                + SQ_INSTS_VALU_TRANS_F32
-                + SQ_INSTS_VALU_FMA_F32 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F64
-                + SQ_INSTS_VALU_MUL_F64
-                + SQ_INSTS_VALU_TRANS_F64
-                + SQ_INSTS_VALU_FMA_F64 * 2
-            )
-        ) + 512 * (
-            SQ_INSTS_VALU_MFMA_MOPS_F16
-            + SQ_INSTS_VALU_MFMA_MOPS_BF16
-            + SQ_INSTS_VALU_MFMA_MOPS_F32
-            + SQ_INSTS_VALU_MFMA_MOPS_F64
-        )
-
-        # Calculate HBM bytes (with 32B/64B/128B distinction)
-        hbm_rd = (
-            TCC_BUBBLE_sum * 128
-            + (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64
-            + TCC_EA0_RDREQ_32B_sum * 32
-        )
-        hbm_wr = TCC_EA0_WRREQ_64B_sum * 64 + (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32
-        hbm_bytes = hbm_rd + hbm_wr
-
-        # Arithmetic intensity = FLOP / byte
-        ai_hbm = fops / hbm_bytes if hbm_bytes > 0 else 0.0
-
-        return ai_hbm
-
-    @metric("compute.l2_arithmetic_intensity")
-    def _l2_arithmetic_intensity(
-        self,
-        SQ_INSTS_VALU_ADD_F16,
-        SQ_INSTS_VALU_MUL_F16,
-        SQ_INSTS_VALU_TRANS_F16,
-        SQ_INSTS_VALU_FMA_F16,
-        SQ_INSTS_VALU_ADD_F32,
-        SQ_INSTS_VALU_MUL_F32,
-        SQ_INSTS_VALU_TRANS_F32,
-        SQ_INSTS_VALU_FMA_F32,
-        SQ_INSTS_VALU_ADD_F64,
-        SQ_INSTS_VALU_MUL_F64,
-        SQ_INSTS_VALU_TRANS_F64,
-        SQ_INSTS_VALU_FMA_F64,
-        SQ_INSTS_VALU_MFMA_MOPS_F16,
-        SQ_INSTS_VALU_MFMA_MOPS_BF16,
-        SQ_INSTS_VALU_MFMA_MOPS_F32,
-        SQ_INSTS_VALU_MFMA_MOPS_F64,
-        TCC_REQ_sum,
-    ):
-        """
-        L2 Arithmetic Intensity: ratio of floating-point operations to L2 cache bytes accessed (FLOP/byte)
-
-        Formula: total_flops / l2_bytes
-        """
-        # Calculate total FLOPS
-        fops = 64 * (
-            (
-                SQ_INSTS_VALU_ADD_F16
-                + SQ_INSTS_VALU_MUL_F16
-                + SQ_INSTS_VALU_TRANS_F16
-                + SQ_INSTS_VALU_FMA_F16 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F32
-                + SQ_INSTS_VALU_MUL_F32
-                + SQ_INSTS_VALU_TRANS_F32
-                + SQ_INSTS_VALU_FMA_F32 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F64
-                + SQ_INSTS_VALU_MUL_F64
-                + SQ_INSTS_VALU_TRANS_F64
-                + SQ_INSTS_VALU_FMA_F64 * 2
-            )
-        ) + 512 * (
-            SQ_INSTS_VALU_MFMA_MOPS_F16
-            + SQ_INSTS_VALU_MFMA_MOPS_BF16
-            + SQ_INSTS_VALU_MFMA_MOPS_F32
-            + SQ_INSTS_VALU_MFMA_MOPS_F64
-        )
-
-        # Calculate L2 bytes (L2 cache line is 128 bytes)
-        l2_bytes = TCC_REQ_sum * 128
-
-        # Arithmetic intensity = FLOP / byte
-        ai_l2 = fops / l2_bytes if l2_bytes > 0 else 0.0
-
-        return ai_l2
-
-    @metric("compute.l1_arithmetic_intensity")
-    def _l1_arithmetic_intensity(
-        self,
-        SQ_INSTS_VALU_ADD_F16,
-        SQ_INSTS_VALU_MUL_F16,
-        SQ_INSTS_VALU_TRANS_F16,
-        SQ_INSTS_VALU_FMA_F16,
-        SQ_INSTS_VALU_ADD_F32,
-        SQ_INSTS_VALU_MUL_F32,
-        SQ_INSTS_VALU_TRANS_F32,
-        SQ_INSTS_VALU_FMA_F32,
-        SQ_INSTS_VALU_ADD_F64,
-        SQ_INSTS_VALU_MUL_F64,
-        SQ_INSTS_VALU_TRANS_F64,
-        SQ_INSTS_VALU_FMA_F64,
-        SQ_INSTS_VALU_MFMA_MOPS_F16,
-        SQ_INSTS_VALU_MFMA_MOPS_BF16,
-        SQ_INSTS_VALU_MFMA_MOPS_F32,
-        SQ_INSTS_VALU_MFMA_MOPS_F64,
-        TCP_TOTAL_CACHE_ACCESSES_sum,
-    ):
-        """
-        L1 Arithmetic Intensity: ratio of floating-point operations to L1 cache bytes accessed (FLOP/byte)
-
-        Formula: total_flops / l1_bytes
-        """
-        # Calculate total FLOPS
-        fops = 64 * (
-            (
-                SQ_INSTS_VALU_ADD_F16
-                + SQ_INSTS_VALU_MUL_F16
-                + SQ_INSTS_VALU_TRANS_F16
-                + SQ_INSTS_VALU_FMA_F16 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F32
-                + SQ_INSTS_VALU_MUL_F32
-                + SQ_INSTS_VALU_TRANS_F32
-                + SQ_INSTS_VALU_FMA_F32 * 2
-            )
-            + (
-                SQ_INSTS_VALU_ADD_F64
-                + SQ_INSTS_VALU_MUL_F64
-                + SQ_INSTS_VALU_TRANS_F64
-                + SQ_INSTS_VALU_FMA_F64 * 2
-            )
-        ) + 512 * (
-            SQ_INSTS_VALU_MFMA_MOPS_F16
-            + SQ_INSTS_VALU_MFMA_MOPS_BF16
-            + SQ_INSTS_VALU_MFMA_MOPS_F32
-            + SQ_INSTS_VALU_MFMA_MOPS_F64
-        )
-
-        # Calculate L1 bytes (L1 cache line is 128 bytes on gfx942)
-        l1_bytes = TCP_TOTAL_CACHE_ACCESSES_sum * 128
-
-        # Arithmetic intensity = FLOP / byte
-        ai_l1 = fops / l1_bytes if l1_bytes > 0 else 0.0
-
-        return ai_l1
diff --git a/metrix/tests/unit/backends/test_backend_metrics.py b/metrix/tests/unit/backends/test_backend_metrics.py
index 804747b..faf45a9 100644
--- a/metrix/tests/unit/backends/test_backend_metrics.py
+++ b/metrix/tests/unit/backends/test_backend_metrics.py
@@ -2,7 +2,8 @@
 Unit tests for backend metric computations (gfx942 and gfx90a)
 
 Tests use MOCK counter data (no hardware counters in test code!)
-Tests are parametrized to run on both MI300X (gfx942) and MI200 (gfx90a)
+Tests are parametrized to run on both MI300X (gfx942) and MI200 (gfx90a).
+All metrics are loaded from counter_defs.yaml.
 """
 
 import pytest
@@ -15,6 +16,11 @@ def backend(request):
     return get_backend(request.param)
 
 
+def compute(backend, metric_name):
+    """Invoke the YAML-loaded metric compute function"""
+    return backend._metrics[metric_name]["compute"]()
+
+
 def get_arch_counter_names(backend, base_names):
     """
     Map counter names based on backend architecture.
@@ -24,7 +30,6 @@ def get_arch_counter_names(backend, base_names):
     arch = backend.device_specs.arch
 
     if arch == "gfx942":
-        # MI300X counter mapping
         mapping = {
             "TCC_EA_RDREQ_sum": "TCC_EA0_RDREQ_sum",
             "TCC_EA_RDREQ_32B_sum": "TCC_EA0_RDREQ_32B_sum",
@@ -33,8 +38,7 @@ def get_arch_counter_names(backend, base_names):
             "TCC_EA_ATOMIC_sum": "TCC_EA0_ATOMIC_sum",
             "TCC_EA_ATOMIC_LEVEL_sum": "TCC_EA0_ATOMIC_LEVEL_sum",
         }
-    else:  # gfx90a
-        # MI200 uses base names as-is
+    else:
         mapping = {}
 
     result = {}
@@ -51,28 +55,28 @@ def test_perfect_hit_rate(self, backend):
         """100% hit rate"""
         backend._raw_data = {"TCC_HIT_sum": 1000, "TCC_MISS_sum": 0}
 
-        result = backend._l2_hit_rate()
+        result = compute(backend, "memory.l2_hit_rate")
         assert result == 100.0
 
     def test_zero_hit_rate(self, backend):
         """0% hit rate (all misses)"""
         backend._raw_data = {"TCC_HIT_sum": 0, "TCC_MISS_sum": 1000}
 
-        result = backend._l2_hit_rate()
+        result = compute(backend, "memory.l2_hit_rate")
         assert result == 0.0
 
     def test_fifty_percent_hit_rate(self, backend):
         """50% hit rate"""
         backend._raw_data = {"TCC_HIT_sum": 500, "TCC_MISS_sum": 500}
 
-        result = backend._l2_hit_rate()
+        result = compute(backend, "memory.l2_hit_rate")
         assert result == 50.0
 
     def test_no_accesses(self, backend):
         """Handle zero total accesses"""
         backend._raw_data = {"TCC_HIT_sum": 0, "TCC_MISS_sum": 0}
 
-        result = backend._l2_hit_rate()
+        result = compute(backend, "memory.l2_hit_rate")
         assert result == 0.0
 
 
@@ -84,10 +88,10 @@ def test_perfect_coalescing(self, backend):
         backend._raw_data = {
             "SQ_INSTS_VMEM_RD": 100,
             "SQ_INSTS_VMEM_WR": 0,
-            "TCP_TOTAL_ACCESSES_sum": 1600,  # 100 * 16
+            "TCP_TOTAL_ACCESSES_sum": 1600,
         }
 
-        result = backend._coalescing_efficiency()
+        result = compute(backend, "memory.coalescing_efficiency")
         assert result == 100.0
 
     def test_poor_coalescing(self, backend):
@@ -95,10 +99,10 @@ def test_poor_coalescing(self, backend):
         backend._raw_data = {
             "SQ_INSTS_VMEM_RD": 100,
             "SQ_INSTS_VMEM_WR": 0,
-            "TCP_TOTAL_ACCESSES_sum": 6400,  # 4x more accesses
+            "TCP_TOTAL_ACCESSES_sum": 6400,
         }
 
-        result = backend._coalescing_efficiency()
+        result = compute(backend, "memory.coalescing_efficiency")
         assert result == 25.0
 
     def test_mixed_read_write(self, backend):
@@ -106,10 +110,10 @@ def test_mixed_read_write(self, backend):
         backend._raw_data = {
             "SQ_INSTS_VMEM_RD": 50,
             "SQ_INSTS_VMEM_WR": 50,
-            "TCP_TOTAL_ACCESSES_sum": 1600,  # (50 + 50) * 16
+            "TCP_TOTAL_ACCESSES_sum": 1600,
         }
 
-        result = backend._coalescing_efficiency()
+        result = compute(backend, "memory.coalescing_efficiency")
         assert result == 100.0
 
     def test_no_memory_instructions(self, backend):
@@ -120,7 +124,7 @@ def test_no_memory_instructions(self, backend):
             "TCP_TOTAL_ACCESSES_sum": 1000,
         }
 
-        result = backend._coalescing_efficiency()
+        result = compute(backend, "memory.coalescing_efficiency")
         assert result == 0.0
 
 
@@ -131,21 +135,21 @@ def test_no_conflicts(self, backend):
         """Perfect LDS access pattern"""
         backend._raw_data = {"SQ_LDS_BANK_CONFLICT": 0, "SQ_INSTS_LDS": 1000}
 
-        result = backend._lds_bank_conflicts()
+        result = compute(backend, "memory.lds_bank_conflicts")
         assert result == 0.0
 
     def test_high_conflicts(self, backend):
         """2 conflicts per instruction"""
         backend._raw_data = {"SQ_LDS_BANK_CONFLICT": 2000, "SQ_INSTS_LDS": 1000}
 
-        result = backend._lds_bank_conflicts()
+        result = compute(backend, "memory.lds_bank_conflicts")
         assert result == 2.0
 
     def test_no_lds_instructions(self, backend):
         """Handle zero LDS instructions"""
         backend._raw_data = {"SQ_LDS_BANK_CONFLICT": 100, "SQ_INSTS_LDS": 0}
 
-        result = backend._lds_bank_conflicts()
+        result = compute(backend, "memory.lds_bank_conflicts")
         assert result == 0.0
 
 
@@ -155,19 +159,17 @@ class TestBandwidthMetrics:
     def test_hbm_read_bandwidth_64b_only(self, backend):
         """Test read bandwidth with only 64B requests"""
         arch = backend.device_specs.arch
-        clock_mhz = backend.device_specs.base_clock_mhz
 
-        # Time calculation based on architecture clock speed
         if arch == "gfx942":
-            active_cycles = 2100000  # 1 ms at 2.1 GHz
+            active_cycles = 2100000
             counters = {
                 "TCC_EA_RDREQ_sum": 1000,
                 "TCC_EA_RDREQ_32B_sum": 0,
-                "TCC_BUBBLE_sum": 0,  # gfx942 has this counter
+                "TCC_BUBBLE_sum": 0,
                 "GRBM_GUI_ACTIVE": active_cycles,
             }
-        else:  # gfx90a
-            active_cycles = 1700000  # 1 ms at 1.7 GHz
+        else:
+            active_cycles = 1700000
             counters = {
                 "TCC_EA_RDREQ_sum": 1000,
                 "TCC_EA_RDREQ_32B_sum": 0,
@@ -175,8 +177,7 @@ def test_hbm_read_bandwidth_64b_only(self, backend):
             }
 
         backend._raw_data = get_arch_counter_names(backend, counters)
-        result = backend._hbm_read_bandwidth()
-        # (1000 * 64 bytes) / 0.001 seconds = 64 MB/s = 0.064 GB/s
+        result = compute(backend, "memory.hbm_read_bandwidth")
         assert 0.06 < result < 0.07
 
     def test_hbm_read_bandwidth_mixed_sizes(self, backend):
@@ -184,30 +185,25 @@ def test_hbm_read_bandwidth_mixed_sizes(self, backend):
         arch = backend.device_specs.arch
 
         if arch == "gfx942":
-            # MI300X with 128B bubble requests
-            active_cycles = 2100000  # 1 ms at 2.1 GHz
+            active_cycles = 2100000
             counters = {
                 "TCC_EA_RDREQ_sum": 1000,
                 "TCC_EA_RDREQ_32B_sum": 200,
-                "TCC_BUBBLE_sum": 300,  # 300 × 128B = 38400 bytes
+                "TCC_BUBBLE_sum": 300,
                 "GRBM_GUI_ACTIVE": active_cycles,
             }
-            # Remaining: 1000 - 200 - 300 = 500 × 64B = 32000 bytes
-            # Total: 6400 + 38400 + 32000 = 76800 bytes
             expected_min, expected_max = 0.07, 0.08
-        else:  # gfx90a
-            # MI200 without 128B counter (all 64B or 32B)
-            active_cycles = 1700000  # 1 ms at 1.7 GHz
+        else:
+            active_cycles = 1700000
             counters = {
                 "TCC_EA_RDREQ_sum": 1000,
                 "TCC_EA_RDREQ_32B_sum": 400,
                 "GRBM_GUI_ACTIVE": active_cycles,
             }
-            # 400 × 32B = 12800, 600 × 64B = 38400, Total = 51200 bytes
             expected_min, expected_max = 0.05, 0.06
 
         backend._raw_data = get_arch_counter_names(backend, counters)
-        result = backend._hbm_read_bandwidth()
+        result = compute(backend, "memory.hbm_read_bandwidth")
         assert expected_min < result < expected_max
 
     def test_hbm_write_bandwidth_64b_only(self, backend):
@@ -215,9 +211,9 @@ def test_hbm_write_bandwidth_64b_only(self, backend):
         arch = backend.device_specs.arch
 
         if arch == "gfx942":
-            active_cycles = 2100000  # 1 ms at 2.1 GHz
-        else:  # gfx90a
-            active_cycles = 1700000  # 1 ms at 1.7 GHz
+            active_cycles = 2100000
+        else:
+            active_cycles = 1700000
 
         counters = {
             "TCC_EA_WRREQ_sum": 1000,
@@ -226,8 +222,7 @@ def test_hbm_write_bandwidth_64b_only(self, backend):
         }
 
         backend._raw_data = get_arch_counter_names(backend, counters)
-        result = backend._hbm_write_bandwidth()
-        # (1000 * 64 bytes) / 0.001 seconds = 64 MB/s = 0.064 GB/s
+        result = compute(backend, "memory.hbm_write_bandwidth")
         assert 0.06 < result < 0.07
 
     def test_hbm_write_bandwidth_mixed_sizes(self, backend):
@@ -235,62 +230,62 @@ def test_hbm_write_bandwidth_mixed_sizes(self, backend):
         arch = backend.device_specs.arch
 
         if arch == "gfx942":
-            active_cycles = 2100000  # 1 ms at 2.1 GHz
-        else:  # gfx90a
-            active_cycles = 1700000  # 1 ms at 1.7 GHz
+            active_cycles = 2100000
+        else:
+            active_cycles = 1700000
 
         counters = {
             "TCC_EA_WRREQ_sum": 1000,
             "TCC_EA_WRREQ_64B_sum": 600,
             "GRBM_GUI_ACTIVE": active_cycles,
         }
-        # 600 × 64B = 38400, 400 × 32B = 12800, Total = 51200 bytes
 
         backend._raw_data = get_arch_counter_names(backend, counters)
-        result = backend._hbm_write_bandwidth()
-        # 51200 / 1e9 / 0.001 = 0.0512 GB/s
+        result = compute(backend, "memory.hbm_write_bandwidth")
         assert 0.05 < result < 0.06
 
     def test_zero_active_cycles(self, backend):
         """Handle zero active cycles"""
         counters = {"TCC_EA_RDREQ_sum": 1000, "TCC_EA_RDREQ_32B_sum": 0, "GRBM_GUI_ACTIVE": 0}
 
-        # Add TCC_BUBBLE for gfx942
         if backend.device_specs.arch == "gfx942":
             counters["TCC_BUBBLE_sum"] = 0
 
         backend._raw_data = get_arch_counter_names(backend, counters)
-        result = backend._hbm_read_bandwidth()
+        result = compute(backend, "memory.hbm_read_bandwidth")
         assert result == 0.0
 
 
 class TestAtomicLatency:
     """Test L2 cache atomic operation latency computation"""
 
-    def test_low_latency(self, backend):
+    @pytest.fixture(params=["gfx942"])
+    def atomic_backend(self, request):
+        """Only gfx942 supports atomic_latency (broken on gfx90a)"""
+        return get_backend(request.param)
+
+    def test_low_latency(self, atomic_backend):
         """10 cycles per atomic operation"""
         counters = {"TCC_EA_ATOMIC_sum": 1000, "TCC_EA_ATOMIC_LEVEL_sum": 10000}
 
-        backend._raw_data = get_arch_counter_names(backend, counters)
-        result = backend._atomic_latency()
-        # 10000 / 1000 = 10 cycles per atomic
+        atomic_backend._raw_data = get_arch_counter_names(atomic_backend, counters)
+        result = compute(atomic_backend, "memory.atomic_latency")
         assert result == 10.0
 
-    def test_high_latency(self, backend):
+    def test_high_latency(self, atomic_backend):
         """1000 cycles per atomic (contention)"""
         counters = {"TCC_EA_ATOMIC_sum": 100, "TCC_EA_ATOMIC_LEVEL_sum": 100000}
 
-        backend._raw_data = get_arch_counter_names(backend, counters)
-        result = backend._atomic_latency()
-        # 100000 / 100 = 1000 cycles per atomic
+        atomic_backend._raw_data = get_arch_counter_names(atomic_backend, counters)
+        result = compute(atomic_backend, "memory.atomic_latency")
         assert result == 1000.0
 
-    def test_no_atomics(self, backend):
+    def test_no_atomics(self, atomic_backend):
         """Handle zero atomic instructions"""
         counters = {"TCC_EA_ATOMIC_sum": 0, "TCC_EA_ATOMIC_LEVEL_sum": 5000}
 
-        backend._raw_data = get_arch_counter_names(backend, counters)
-        result = backend._atomic_latency()
+        atomic_backend._raw_data = get_arch_counter_names(atomic_backend, counters)
+        result = compute(atomic_backend, "memory.atomic_latency")
         assert result == 0.0
 
 
@@ -298,29 +293,24 @@ class TestMetricDiscovery:
     """Test backend auto-discovers metrics"""
 
     def test_discovers_all_metrics(self, backend):
-        """Backend should auto-discover all @metric decorated methods"""
+        """Backend should discover all YAML-defined metrics"""
         metrics = backend.get_available_metrics()
 
-        # Should have all the metrics we defined
         assert "memory.l2_hit_rate" in metrics
         assert "memory.coalescing_efficiency" in metrics
         assert "memory.lds_bank_conflicts" in metrics
         assert "memory.hbm_read_bandwidth" in metrics
 
-        # atomic_latency is architecture-specific
         if backend.device_specs.arch == "gfx90a":
-            # On MI200, atomic_latency is unsupported (broken counter)
             assert "memory.atomic_latency" not in metrics
             assert "memory.atomic_latency" in backend._unsupported_metrics
         else:
-            # On other architectures (gfx942, etc), it's supported
             assert "memory.atomic_latency" in metrics
 
     def test_get_required_counters(self, backend):
         """Backend should correctly report required counters for a metric"""
         counters = backend.get_required_counters(["memory.l2_hit_rate"])
 
-        # Should require TCC_HIT_sum and TCC_MISS_sum (counter names appear in function signature)
         assert "TCC_HIT_sum" in counters
         assert "TCC_MISS_sum" in counters
         assert len(counters) == 2
@@ -365,8 +355,7 @@ def test_total_flops_fp32_add(self, backend):
         backend._raw_data = self._get_zero_flops_counters()
         backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 100
 
-        result = backend._total_flops()
-        # 64 threads per wave * 100 instructions = 6400 FLOPS
+        result = compute(backend, "compute.total_flops")
         assert result == 6400
 
     def test_total_flops_fma_counts_double(self, backend):
@@ -374,8 +363,7 @@ def test_total_flops_fma_counts_double(self, backend):
         backend._raw_data = self._get_zero_flops_counters()
         backend._raw_data["SQ_INSTS_VALU_FMA_F32"] = 100
 
-        result = backend._total_flops()
-        # 64 threads * 100 FMA * 2 ops = 12800 FLOPS
+        result = compute(backend, "compute.total_flops")
         assert result == 12800
 
     def test_total_flops_mfma_high_throughput(self, backend):
@@ -383,40 +371,39 @@ def test_total_flops_mfma_high_throughput(self, backend):
         backend._raw_data = self._get_zero_flops_counters()
         backend._raw_data["SQ_INSTS_VALU_MFMA_MOPS_F32"] = 10
 
-        result = backend._total_flops()
-        # 512 ops * 10 instructions = 5120 FLOPS
+        result = compute(backend, "compute.total_flops")
         assert result == 5120
 
     def test_total_flops_mixed_precision(self, backend):
         """Test FLOPS with mixed precision operations"""
         backend._raw_data = self._get_zero_flops_counters()
-        backend._raw_data["SQ_INSTS_VALU_ADD_F16"] = 100  # 6400 FLOPS
-        backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 50  # 3200 FLOPS
-        backend._raw_data["SQ_INSTS_VALU_ADD_F64"] = 25  # 1600 FLOPS
+        backend._raw_data["SQ_INSTS_VALU_ADD_F16"] = 100
+        backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 50
+        backend._raw_data["SQ_INSTS_VALU_ADD_F64"] = 25
 
-        result = backend._total_flops()
+        result = compute(backend, "compute.total_flops")
         assert result == 6400 + 3200 + 1600
 
     def test_total_flops_zero(self, backend):
         """Handle zero FLOPS gracefully"""
         backend._raw_data = self._get_zero_flops_counters()
 
-        result = backend._total_flops()
+        result = compute(backend, "compute.total_flops")
         assert result == 0
 
     def test_hbm_gflops_zero_time(self, backend):
-        """Handle zero active cycles"""
+        """Handle zero duration"""
         backend._raw_data = self._get_zero_flops_counters()
         backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000
-        backend._raw_data["GRBM_GUI_ACTIVE"] = 0
+        backend._current_duration_us = 0.0
 
-        result = backend._hbm_gflops()
+        result = compute(backend, "compute.hbm_gflops")
         assert result == 0.0
 
     def test_hbm_arithmetic_intensity(self, backend):
         """Test HBM arithmetic intensity calculation"""
         backend._raw_data = self._get_zero_flops_counters()
-        backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000  # 64000 FLOPS
+        backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000
 
         counters = {
             "TCC_EA_RDREQ_sum": 1000,
@@ -425,14 +412,12 @@ def test_hbm_arithmetic_intensity(self, backend):
             "TCC_EA_WRREQ_64B_sum": 0,
         }
 
-        # Add TCC_BUBBLE for gfx942
         if backend.device_specs.arch == "gfx942":
             counters["TCC_BUBBLE_sum"] = 0
 
         backend._raw_data.update(get_arch_counter_names(backend, counters))
 
-        result = backend._hbm_arithmetic_intensity()
-        # 64000 FLOPS / (1000 * 64 bytes) = 64000 / 64000 = 1.0 FLOP/byte
+        result = compute(backend, "compute.hbm_arithmetic_intensity")
         assert result == 1.0
 
     def test_hbm_arithmetic_intensity_zero_bytes(self, backend):
@@ -452,17 +437,16 @@ def test_hbm_arithmetic_intensity_zero_bytes(self, backend):
 
         backend._raw_data.update(get_arch_counter_names(backend, counters))
 
-        result = backend._hbm_arithmetic_intensity()
+        result = compute(backend, "compute.hbm_arithmetic_intensity")
         assert result == 0.0
 
     def test_l2_arithmetic_intensity(self, backend):
         """Test L2 arithmetic intensity calculation"""
         backend._raw_data = self._get_zero_flops_counters()
-        backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000  # 64000 FLOPS
-        backend._raw_data["TCC_REQ_sum"] = 500  # 500 * 128 = 64000 bytes
+        backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000
+        backend._raw_data["TCC_REQ_sum"] = 500
 
-        result = backend._l2_arithmetic_intensity()
-        # 64000 FLOPS / 64000 bytes = 1.0 FLOP/byte
+        result = compute(backend, "compute.l2_arithmetic_intensity")
         assert result == 1.0
 
     def test_l2_arithmetic_intensity_zero_bytes(self, backend):
@@ -471,24 +455,20 @@ def test_l2_arithmetic_intensity_zero_bytes(self, backend):
         backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000
         backend._raw_data["TCC_REQ_sum"] = 0
 
-        result = backend._l2_arithmetic_intensity()
+        result = compute(backend, "compute.l2_arithmetic_intensity")
         assert result == 0.0
 
     def test_l1_arithmetic_intensity(self, backend):
         """Test L1 arithmetic intensity calculation"""
         backend._raw_data = self._get_zero_flops_counters()
-        backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000  # 64000 FLOPS
+        backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000
 
-        # L1 cache line size differs by architecture:
-        # gfx942 (MI300X): 128 bytes
-        # gfx90a (MI200): 64 bytes
         if backend.device_specs.arch == "gfx942":
-            backend._raw_data["TCP_TOTAL_CACHE_ACCESSES_sum"] = 500  # 500 * 128 = 64000 bytes
-        else:  # gfx90a
-            backend._raw_data["TCP_TOTAL_CACHE_ACCESSES_sum"] = 1000  # 1000 * 64 = 64000 bytes
+            backend._raw_data["TCP_TOTAL_CACHE_ACCESSES_sum"] = 500
+        else:
+            backend._raw_data["TCP_TOTAL_CACHE_ACCESSES_sum"] = 1000
 
-        result = backend._l1_arithmetic_intensity()
-        # 64000 FLOPS / 64000 bytes = 1.0 FLOP/byte
+        result = compute(backend, "compute.l1_arithmetic_intensity")
         assert result == 1.0
 
     def test_l1_arithmetic_intensity_zero_bytes(self, backend):
@@ -497,14 +477,13 @@ def test_l1_arithmetic_intensity_zero_bytes(self, backend):
         backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 1000
         backend._raw_data["TCP_TOTAL_CACHE_ACCESSES_sum"] = 0
 
-        result = backend._l1_arithmetic_intensity()
+        result = compute(backend, "compute.l1_arithmetic_intensity")
         assert result == 0.0
 
     def test_high_arithmetic_intensity_compute_bound(self, backend):
         """Test high AI indicates compute-bound kernel"""
         backend._raw_data = self._get_zero_flops_counters()
-        # Lots of compute, little memory
-        backend._raw_data["SQ_INSTS_VALU_MFMA_MOPS_F32"] = 1000  # 512000 FLOPS
+        backend._raw_data["SQ_INSTS_VALU_MFMA_MOPS_F32"] = 1000
 
         counters = {
             "TCC_EA_RDREQ_sum": 100,
@@ -518,15 +497,13 @@ def test_high_arithmetic_intensity_compute_bound(self, backend):
 
         backend._raw_data.update(get_arch_counter_names(backend, counters))
 
-        result = backend._hbm_arithmetic_intensity()
-        # 512000 / 6400 = 80 FLOP/byte (very compute-bound)
+        result = compute(backend, "compute.hbm_arithmetic_intensity")
         assert result == 80.0
 
     def test_low_arithmetic_intensity_memory_bound(self, backend):
         """Test low AI indicates memory-bound kernel"""
         backend._raw_data = self._get_zero_flops_counters()
-        # Little compute, lots of memory
-        backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 100  # 6400 FLOPS
+        backend._raw_data["SQ_INSTS_VALU_ADD_F32"] = 100
 
         counters = {
             "TCC_EA_RDREQ_sum": 10000,
@@ -540,6 +517,5 @@ def test_low_arithmetic_intensity_memory_bound(self, backend):
 
         backend._raw_data.update(get_arch_counter_names(backend, counters))
 
-        result = backend._hbm_arithmetic_intensity()
-        # 6400 / 640000 = 0.01 FLOP/byte (very memory-bound)
+        result = compute(backend, "compute.hbm_arithmetic_intensity")
         assert result == 0.01
diff --git a/metrix/tests/unit/test_error_handling.py b/metrix/tests/unit/test_error_handling.py
index 91dec21..36a3bc8 100644
--- a/metrix/tests/unit/test_error_handling.py
+++ b/metrix/tests/unit/test_error_handling.py
@@ -174,7 +174,7 @@ def test_division_by_zero_handling(self, arch):
         backend._raw_data = {"TCC_HIT_sum": 0, "TCC_MISS_sum": 0}
 
         # Should return 0.0, not raise ZeroDivisionError
-        result = backend._l2_hit_rate()
+        result = backend._metrics["memory.l2_hit_rate"]["compute"]()
         assert result == 0.0
 
     @pytest.mark.parametrize("arch", ["gfx942", "gfx90a"])
@@ -189,5 +189,5 @@ def test_negative_values_handling(self, arch):
         }
 
         # Should not crash
-        result = backend._l2_hit_rate()
+        result = backend._metrics["memory.l2_hit_rate"]["compute"]()
         assert isinstance(result, (int, float))