AMDResearch · mawad-amd · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/metrix/README.md b/metrix/README.md
@@ -20,6 +20,7 @@ Existing GPU profilers have challenges:
 - **5 Compute Metrics**: FLOPS, arithmetic intensity (HBM/L2/L1), compute throughput
 - **Multi-Run Profiling**: Automatic aggregation with min/max/avg statistics
 - **Kernel Filtering**: Efficient regex filtering at rocprofv3 level
+- **Launch Selection**: Optional rocprofv3 `kernel_iteration_range` (CLI `--kernel-iteration` / `--kernel-iteration-range`) to target the Nth launch of a kernel when it runs in a loop or after warmups
 - **Multiple Output Formats**: Text, JSON, CSV
 
 ## Installation
@@ -110,6 +111,10 @@ metrix profile [options] <target>
   --time-only        Only collect timing, no hardware counters
   --kernel, -k       Filter kernels by name (regular expression, passed to rocprofv3)
   --num-replays, -n  Replay the application N times and aggregate (default: 10)
+  --kernel-iteration N
+                     Counters only for the Nth launch of each matched kernel (range [N,N])
+  --kernel-iteration-range RANGE
+                     Explicit jobs[].kernel_iteration_range for the rocprofv3 --input YAML, e.g. "[1,3]"
   --aggregate        Aggregate metrics by kernel name across replays (default: per-dispatch across runs)
   --top K            Show only top K slowest kernels
   --output, -o       Output file (.json, .csv, .txt)
@@ -128,10 +133,20 @@ metrix info <metric|profile> <name>
 
 Note: GPU architecture is auto-detected using `rocminfo`.
 
+**rocprofv3 iteration field:** Metrix passes counters via `rocprofv3 --input <file>`. The launch-index
+window is the YAML key **`kernel_iteration_range`** on each object under top-level **`jobs`**
+(ROCprofiler-SDK input schema), not a separate `rocprofv3` argv flag.
+
+**Dispatch index and `--kernel`:** Launch indices are **per kernel name** (each matched kernel has its
+own 1-based counter). Use **`--kernel`** / **`-k`** whenever you use **`--kernel-iteration`** or
+**`--kernel-iteration-range`** so the Nth launch refers to the kernel you care about; without a
+narrow filter, the selected launch(es) of every kernel matching the default regex are profiled.
+
 ## Testing
 
 ```bash
-python3 -m pytest tests/ -v
+python3 -m pytest tests/unit/ -q   # fast, no GPU
+python3 -m pytest tests/ -v        # includes integration (GPU / binaries where applicable)
 ```
 
 ## Requirements

diff --git a/metrix/skill/SKILL.md b/metrix/skill/SKILL.md
@@ -42,7 +42,9 @@ metrix --metrics memory.l2_hit_rate,memory.coalescing_efficiency,compute.total_f
 metrix -o results.json ./my_app
 ```
 
-Options: `--profile`/`-p` (run `metrix list profiles` for names: `quick`, `memory`, `memory_bandwidth`, `memory_cache`, `compute`), `--metrics`/`-m`, `--time-only`, `--kernel`/`-k` (regular expression), `--num-replays`/`-n`, `--output`/`-o`, `--top`, `--aggregate`, `--timeout`, `--no-counters`, `--log`/`-l`, `--quiet`/`-q`. Discovery: `metrix list <metrics|profiles|devices>`, `metrix info <metric|profile> <name>`. Note: `metrix list counters` and `metrix info counter <name>` are not implemented yet (CLI reports “not yet implemented”).
+Options: `--profile`/`-p` (run `metrix list profiles` for names: `quick`, `memory`, `memory_bandwidth`, `memory_cache`, `compute`), `--metrics`/`-m`, `--time-only`, `--kernel`/`-k` (regular expression), `--num-replays`/`-n`, `--kernel-iteration` / `--kernel-iteration-range` (YAML `jobs[].kernel_iteration_range`), `--output`/`-o`, `--top`, `--aggregate`, `--timeout`, `--no-counters`, `--log`/`-l`, `--quiet`/`-q`. Discovery: `metrix list <metrics|profiles|devices>`, `metrix info <metric|profile> <name>`. Note: `metrix list counters` and `metrix info counter <name>` are not implemented yet (CLI reports “not yet implemented”).
+
+With `--kernel-iteration*`, always set `--kernel` as well: launch indices are counted **per kernel name**, so a narrow regex keeps “Nth launch” meaningful.
 
 ### Python API
 
@@ -51,6 +53,7 @@ from metrix import Metrix
 
 profiler = Metrix()
 results = profiler.profile("./my_app", num_replays=5)
+# Optional profile() keyword arguments: kernel_iteration_range="[5,5]", kernel_filter="^my_kernel" (launch indices are per kernel name)
 
 for kernel in results.kernels:
     print(kernel.name, kernel.duration_us.avg)

diff --git a/metrix/src/metrix/api.py b/metrix/src/metrix/api.py
@@ -87,6 +87,7 @@ def profile(
         aggregate_by_kernel: bool = True,
         cwd: Optional[str] = None,
         timeout_seconds: Optional[int] = 0,
+        kernel_iteration_range: Optional[str] = None,
     ) -> ProfilingResults:
         """
         Profile a command
@@ -109,6 +110,9 @@ def profile(
             aggregate_by_kernel: Aggregate dispatches by kernel name (default: True)
             cwd: Working directory for command execution (default: None)
             timeout_seconds: Timeout in seconds for profiling (default: 0, zero or None for no timeout)
+            kernel_iteration_range: Optional YAML ``jobs[].kernel_iteration_range`` string
+                (rocprofv3 ``--input``). E.g. ``"[5,5]"`` for only the 5th launch of each matched
+                kernel. Metrix runs ``num_replays`` profiling passes, each applying this range.
 
         Returns:
             ProfilingResults object with all collected data
@@ -158,6 +162,8 @@ def profile(
         logger.info(f"Collecting {len(metrics_to_compute)} metrics across {num_replays} replay(s)")
         if rocprof_filter:
             logger.info(f"Kernel filter: {rocprof_filter}")
+        if kernel_iteration_range:
+            logger.info(f"Kernel iteration range: {kernel_iteration_range}")
 
         # Profile using backend (filtering at rocprofv3 level)
         logger.debug(f"Calling backend.profile with {len(metrics_to_compute)} metrics")
@@ -169,6 +175,7 @@ def profile(
             kernel_filter=rocprof_filter,
             cwd=cwd,
             timeout_seconds=timeout_seconds,
+            kernel_iteration_range=kernel_iteration_range,
         )
         logger.debug("Backend.profile completed")
 

diff --git a/metrix/src/metrix/backends/base.py b/metrix/src/metrix/backends/base.py
@@ -472,7 +472,7 @@ def profile(
         kernel_filter: Optional[str] = None,
         cwd: Optional[str] = None,
         timeout_seconds: Optional[int] = 0,
-        use_kernel_iteration_range: bool = False,  # Disabled: rocprofv3 hangs with multiple counter blocks
+        kernel_iteration_range: Optional[str] = None,
     ):
         """
         Profile command with two-level aggregation and multi-pass support
@@ -490,16 +490,25 @@ def profile(
                 in profiling results.
             cwd: Working directory for command execution
             timeout_seconds: Timeout in seconds for profiling (default: 0, None for no timeout)
+            kernel_iteration_range: Optional YAML ``jobs[].kernel_iteration_range`` string passed
+                to rocprofv3 ``--input`` (e.g. ``"[5,5]"`` for the 5th launch of each matched kernel).
+                When set, Metrix runs ``num_replays`` profiling passes, each applying this range.
 
         Returns:
             self (for chaining)
         """
         from ..logger import logger
 
+        # rocprofv3 jobs[].kernel_iteration_range — optional per-kernel launch index window.
+        user_iteration_range: Optional[str] = (
+            kernel_iteration_range.strip() if kernel_iteration_range else None
+        )
+        effective_iteration_range: Optional[str] = user_iteration_range
+
         # Get counters needed
         counters = self.get_required_counters(metrics)
 
-        # Split metrics into category-based batches to avoid rocprofv3 hangs
+        # Split large metric sets into category-based batches (fewer counters per rocprof pass)
         # Group by category (memory.*, proprietary.*, etc.) for better organization
         MAX_METRICS_PER_BATCH = 6
         if len(metrics) > MAX_METRICS_PER_BATCH:
@@ -550,7 +559,7 @@ def profile(
                     kernel_filter=kernel_filter,
                     cwd=cwd,
                     timeout_seconds=timeout_seconds,
-                    use_kernel_iteration_range=use_kernel_iteration_range,
+                    kernel_iteration_range=kernel_iteration_range,
                 )
 
                 # Merge batch results
@@ -628,28 +637,29 @@ def profile(
 
             pass_results = []
 
-            # Use kernel_iteration_range for faster profiling
-            if use_kernel_iteration_range:
-                iteration_range = f"[1,{num_replays}]"
+            if effective_iteration_range is not None:
                 logger.info(
-                    f"  Using kernel_iteration_range={iteration_range} (rocprofv3 internal iterations)"
-                )
-                results = self._run_rocprof(
-                    command,
-                    pass_counters,
-                    kernel_filter,
-                    cwd=cwd,
-                    timeout_seconds=timeout_seconds,
-                    kernel_iteration_range=iteration_range,
+                    f"  Using kernel_iteration_range={effective_iteration_range} "
+                    f"across {num_replays} replay(s)"
                 )
-                # Tag all results with replay_id 0 since rocprofv3 handles iterations
-                for r in results:
-                    r.run_id = 0
-                pass_results.extend(results)
+                for replay_id in range(num_replays):
+                    if num_replays >= 20 and (
+                        replay_id == 0 or (replay_id + 1) % 10 == 0 or replay_id == num_replays - 1
+                    ):
+                        logger.info(f"  Replay {replay_id + 1}/{num_replays}...")
+                    results = self._run_rocprof(
+                        command,
+                        pass_counters,
+                        kernel_filter,
+                        cwd=cwd,
+                        timeout_seconds=timeout_seconds,
+                        kernel_iteration_range=effective_iteration_range,
+                    )
+                    for r in results:
+                        r.run_id = replay_id
+                    pass_results.extend(results)
             else:
-                # Legacy mode: run application multiple times
                 for replay_id in range(num_replays):
-                    # Show progress every 10 replays or at key milestones
                     if num_replays >= 20 and (
                         replay_id == 0 or (replay_id + 1) % 10 == 0 or replay_id == num_replays - 1
                     ):
@@ -662,7 +672,6 @@ def profile(
                         cwd=cwd,
                         timeout_seconds=timeout_seconds,
                     )
-                    # Tag with replay_id for debugging
                     for r in results:
                         r.run_id = replay_id
                     pass_results.extend(results)
@@ -790,6 +799,7 @@ def _run_rocprof(
             kernel_filter: Optional regular expression to filter kernels by name
             cwd: Optional working directory for command execution
             timeout_seconds: Timeout in seconds for profiling (default: 0, zero or None for no timeout)
+            kernel_iteration_range: Optional rocprofv3 job field (e.g. ``"[2,4]"``)
 
         Returns:
             List of ProfileResult objects

diff --git a/metrix/src/metrix/backends/gfx90a.py b/metrix/src/metrix/backends/gfx90a.py
@@ -99,10 +99,17 @@ def _run_rocprof(
         kernel_filter: Optional[str] = None,
         cwd: Optional[str] = None,
         timeout_seconds: Optional[int] = 0,
+        kernel_iteration_range: Optional[str] = None,
     ) -> List[ProfileResult]:
         """Run rocprofv3 and return results (single pass only - base class handles multi-pass)"""
         wrapper = ROCProfV3Wrapper(timeout_seconds=timeout_seconds)
-        return wrapper.profile(command, counters, kernel_filter=kernel_filter, cwd=cwd)
+        return wrapper.profile(
+            command,
+            counters,
+            kernel_filter=kernel_filter,
+            cwd=cwd,
+            kernel_iteration_range=kernel_iteration_range,
+        )
 
     # Memory bandwidth metrics
 

diff --git a/metrix/src/metrix/backends/gfx942.py b/metrix/src/metrix/backends/gfx942.py
@@ -99,10 +99,17 @@ def _run_rocprof(
         kernel_filter: Optional[str] = None,
         cwd: Optional[str] = None,
         timeout_seconds: Optional[int] = 0,
+        kernel_iteration_range: Optional[str] = None,
     ) -> List[ProfileResult]:
         """Run rocprofv3 and return results (single pass only - base class handles multi-pass)"""
         wrapper = ROCProfV3Wrapper(timeout_seconds=timeout_seconds)
-        return wrapper.profile(command, counters, kernel_filter=kernel_filter, cwd=cwd)
+        return wrapper.profile(
+            command,
+            counters,
+            kernel_filter=kernel_filter,
+            cwd=cwd,
+            kernel_iteration_range=kernel_iteration_range,
+        )
 
     # Memory bandwidth metrics
 

diff --git a/metrix/src/metrix/cli/main.py b/metrix/src/metrix/cli/main.py
@@ -14,6 +14,13 @@
 from .info_cmd import info_command
 
 
+def _positive_int(value: str) -> int:  # argparse type= callable for --kernel-iteration
+    n = int(value)  # ValueError on non-numeric text; argparse reports it as an invalid argument
+    if n < 1:  # launch indices are 1-based, so 0 and negatives are rejected
+        raise argparse.ArgumentTypeError("must be >= 1")
+    return n
+
+
 def create_parser():
     """Create argument parser"""
 
@@ -32,6 +39,9 @@ def create_parser():
   # Filter specific kernels (regex)
   metrix profile --profile memory --kernel "matmul.*" ./my_app
 
+  # Only the 10th launch of a repeatedly-dispatched kernel (use with --kernel)
+  metrix profile --kernel "^my_gemm" --kernel-iteration 10 -n 3 ./bench
+
   # List available metrics
   metrix list metrics --category memory
 
@@ -130,6 +140,28 @@ def create_parser():
         help="Aggregate metrics by kernel name across replays (default: per-dispatch across runs)",
     )
 
+    _kernel_iter = profile_parser.add_mutually_exclusive_group()
+    _kernel_iter.add_argument(
+        "--kernel-iteration",
+        type=_positive_int,
+        metavar="N",
+        help=(
+            "Collect hardware counters only for the Nth launch of each kernel matching "
+            "--kernel (YAML jobs[].kernel_iteration_range [N,N]). Use when the app runs the "
+            "same kernel multiple times (warmups, loops) and you want one launch. Combine "
+            "with -n to average across multiple full-app replays."
+        ),
+    )
+    _kernel_iter.add_argument(
+        "--kernel-iteration-range",
+        type=str,
+        metavar="RANGE",
+        help=(
+            "YAML jobs[].kernel_iteration_range passed via rocprofv3 --input, e.g. "
+            '"[1,3]" or "[10,10]". See ROCprofiler-SDK input schema.'
+        ),
+    )
+
     # List command
     list_parser = subparsers.add_parser(
         "list",

diff --git a/metrix/src/metrix/cli/profile_cmd.py b/metrix/src/metrix/cli/profile_cmd.py
@@ -84,10 +84,26 @@ def profile_command(args):
         logger.info(f"Replays: {args.num_replays}")
     if args.kernel:
         logger.info(f"Filter: {args.kernel}")
+    kernel_iteration = getattr(args, "kernel_iteration", None)
+    kernel_iteration_range = getattr(args, "kernel_iteration_range", None)
+    resolved_iteration_range = None
+    if kernel_iteration is not None:
+        resolved_iteration_range = f"[{kernel_iteration},{kernel_iteration}]"
+    elif kernel_iteration_range:
+        resolved_iteration_range = kernel_iteration_range.strip()
+    if resolved_iteration_range:
+        logger.info(
+            f"Kernel iteration range (jobs[].kernel_iteration_range): {resolved_iteration_range}"
+        )
     logger.info(f"{'=' * 80}")
 
     # Build kernel filter (regular expression, passed through to profiler)
     kernel_filter = args.kernel if args.kernel else None
+    if resolved_iteration_range and not kernel_filter:
+        logger.warning(
+            "Without --kernel, rocprofv3 counts launches separately per kernel name; "
+            "use --kernel to target one kernel when using iteration ranges."
+        )
 
     # Profile using backend (handles multi-replay & aggregation internally!)
     try:
@@ -100,6 +116,8 @@ def profile_command(args):
             num_replays=args.num_replays,
             aggregate_by_kernel=args.aggregate,
             kernel_filter=kernel_filter,
+            timeout_seconds=getattr(args, "timeout", 0),
+            kernel_iteration_range=resolved_iteration_range,
         )
     except Exception as e:
         logger.error(f"Profiling failed: {e}")

diff --git a/metrix/src/metrix/mcp/server.py b/metrix/src/metrix/mcp/server.py
@@ -4,6 +4,8 @@
 
 """MCP Server for Metrix - Human-Readable GPU Metrics."""
 
+from typing import List, Optional
+
 from mcp.server.fastmcp import FastMCP
 
 from metrix import Metrix
@@ -12,7 +14,13 @@
 
 
 @mcp.tool()
-def profile_metrics(command: str, metrics: list[str] = None) -> dict:
+def profile_metrics(
+    command: str,
+    metrics: Optional[List[str]] = None,
+    kernel_filter: Optional[str] = None,
+    kernel_iteration_range: Optional[str] = None,
+    num_replays: int = 1,
+) -> dict:
     """
     Profile GPU application and collect hardware performance metrics.
 
@@ -23,6 +31,9 @@ def profile_metrics(command: str, metrics: list[str] = None) -> dict:
     Args:
         command: Command to profile (e.g., './app')
         metrics: List of metrics to collect (default: common metrics)
+        kernel_filter: Optional regex; only matching kernels are profiled
+        kernel_iteration_range: Optional jobs[].kernel_iteration_range string (rocprofv3 --input YAML)
+        num_replays: Number of full-app profiling passes (-n)
 
     Returns:
         Dictionary with kernels list containing metrics and durations
@@ -33,7 +44,13 @@ def profile_metrics(command: str, metrics: list[str] = None) -> dict:
     if metrics is None:
         metrics = ["memory.hbm_bandwidth_utilization"]
 
-    results_obj = profiler.profile(command, metrics=metrics)
+    results_obj = profiler.profile(
+        command,
+        metrics=metrics,
+        kernel_filter=kernel_filter,
+        kernel_iteration_range=kernel_iteration_range,
+        num_replays=num_replays,
+    )
 
     results = {"kernels": []}
 

diff --git a/metrix/src/metrix/profiler/rocprof_wrapper.py b/metrix/src/metrix/profiler/rocprof_wrapper.py
@@ -98,7 +98,8 @@ def profile(
                   ``".*attention.*"``   - kernels whose names contain "attention"
                   ``"gemm|attention"``  - kernels matching either pattern
             cwd: Optional working directory
-            kernel_iteration_range: Optional iteration range (e.g., "[1,5]" to profile iterations 1-5)
+            kernel_iteration_range: Optional rocprofv3 job field (e.g. ``"[1,5]"`` or ``"[3,3]"``
+                for only the 3rd launch of each kernel matching ``kernel_filter``)
             extra_counters_path: Path to YAML with custom counter definitions (rocprofiler-sdk: section)
             arch: GPU architecture (e.g., "gfx1201") to filter counter definitions
 
@@ -331,6 +332,7 @@ def _create_input_yaml(
             "truncate_kernels": True,
         }
 
+        # ROCprofiler-SDK / rocprofv3 --input YAML: jobs[].kernel_iteration_range (e.g. "[1,5]")
         if kernel_iteration_range:
             job["kernel_iteration_range"] = kernel_iteration_range