diff --git a/metrix/README.md b/metrix/README.md index f6e8351..5131827 100644 --- a/metrix/README.md +++ b/metrix/README.md @@ -20,6 +20,7 @@ Existing GPU profilers have challenges: - **5 Compute Metrics**: FLOPS, arithmetic intensity (HBM/L2/L1), compute throughput - **Multi-Run Profiling**: Automatic aggregation with min/max/avg statistics - **Kernel Filtering**: Efficient regex filtering at rocprofv3 level +- **Launch selection**: Optional rocprofv3 ``kernel_iteration_range`` (CLI ``--kernel-iteration`` / ``--kernel-iteration-range``) to target a specific Nth launch when kernels run in a loop or after warmups - **Multiple Output Formats**: Text, JSON, CSV ## Installation @@ -110,6 +111,10 @@ metrix profile [options] --time-only Only collect timing, no hardware counters --kernel, -k Filter kernels by name (regular expression, passed to rocprofv3) --num-replays, -n Replay the application N times and aggregate (default: 10) + --kernel-iteration N + Counters only for the Nth launch of each matched kernel (``[N,N]``) + --kernel-iteration-range RANGE + Explicit ``jobs[].kernel_iteration_range`` in rocprofv3 ``--input`` YAML --aggregate Aggregate metrics by kernel name across replays (default: per-dispatch across runs) --top K Show only top K slowest kernels --output, -o Output file (.json, .csv, .txt) @@ -128,10 +133,20 @@ metrix info Note: GPU architecture is auto-detected using `rocminfo`. +**rocprofv3 iteration field:** Metrix passes counters via `rocprofv3 --input <input.yaml>`. The launch-index +window is the YAML key **`kernel_iteration_range`** on each object under top-level **`jobs`** +(ROCprofiler-SDK input schema), not a separate `rocprofv3` argv flag. + +**Dispatch index and `--kernel`:** Launch indices are **per kernel name** (each matched kernel has its +own 1-based counter). 
Use **`--kernel`** / **`-k`** whenever you use **`--kernel-iteration`** or +**`--kernel-iteration-range`** so the Nth launch refers to the kernel you care about; without a +narrow filter, results include every kernel that matches the default regex. + ## Testing ```bash -python3 -m pytest tests/ -v +python3 -m pytest tests/unit/ -q # fast, no GPU +python3 -m pytest tests/ -v # includes integration (GPU / binaries where applicable) ``` ## Requirements diff --git a/metrix/skill/SKILL.md b/metrix/skill/SKILL.md index 969ef6e..8f707de 100644 --- a/metrix/skill/SKILL.md +++ b/metrix/skill/SKILL.md @@ -42,7 +42,9 @@ metrix --metrics memory.l2_hit_rate,memory.coalescing_efficiency,compute.total_f metrix -o results.json ./my_app ``` -Options: `--profile`/`-p` (run `metrix list profiles` for names: `quick`, `memory`, `memory_bandwidth`, `memory_cache`, `compute`), `--metrics`/`-m`, `--time-only`, `--kernel`/`-k` (regular expression), `--num-replays`/`-n`, `--output`/`-o`, `--top`, `--aggregate`, `--timeout`, `--no-counters`, `--log`/`-l`, `--quiet`/`-q`. Discovery: `metrix list `, `metrix info `. Note: `metrix list counters` and `metrix info counter ` are not implemented yet (CLI reports “not yet implemented”). +Options: `--profile`/`-p` (run `metrix list profiles` for names: `quick`, `memory`, `memory_bandwidth`, `memory_cache`, `compute`), `--metrics`/`-m`, `--time-only`, `--kernel`/`-k` (regular expression), `--num-replays`/`-n`, `--kernel-iteration` / `--kernel-iteration-range` (YAML `jobs[].kernel_iteration_range`), `--output`/`-o`, `--top`, `--aggregate`, `--timeout`, `--no-counters`, `--log`/`-l`, `--quiet`/`-q`. Discovery: `metrix list `, `metrix info `. Note: `metrix list counters` and `metrix info counter ` are not implemented yet (CLI reports “not yet implemented”). + +With `--kernel-iteration*`, always set `--kernel` as well: launch indices are counted **per kernel name**, so a narrow regex keeps “Nth launch” meaningful. 
### Python API @@ -51,6 +53,7 @@ from metrix import Metrix profiler = Metrix() results = profiler.profile("./my_app", num_replays=5) +# Optional: kernel_iteration_range="[5,5]", kernel_filter="^my_kernel" (indices are per kernel name) for kernel in results.kernels: print(kernel.name, kernel.duration_us.avg) diff --git a/metrix/src/metrix/api.py b/metrix/src/metrix/api.py index 02738f2..fb0774a 100644 --- a/metrix/src/metrix/api.py +++ b/metrix/src/metrix/api.py @@ -87,6 +87,7 @@ def profile( aggregate_by_kernel: bool = True, cwd: Optional[str] = None, timeout_seconds: Optional[int] = 0, + kernel_iteration_range: Optional[str] = None, ) -> ProfilingResults: """ Profile a command @@ -109,6 +110,9 @@ def profile( aggregate_by_kernel: Aggregate dispatches by kernel name (default: True) cwd: Working directory for command execution (default: None) timeout_seconds: Timeout in seconds for profiling (default: 0, zero or None for no timeout) + kernel_iteration_range: Optional YAML ``jobs[].kernel_iteration_range`` string + (rocprofv3 ``--input``). E.g. ``"[5,5]"`` for only the 5th launch of each matched + kernel. Metrix runs ``num_replays`` profiling passes, each applying this range. 
Returns: ProfilingResults object with all collected data @@ -158,6 +162,8 @@ def profile( logger.info(f"Collecting {len(metrics_to_compute)} metrics across {num_replays} replay(s)") if rocprof_filter: logger.info(f"Kernel filter: {rocprof_filter}") + if kernel_iteration_range: + logger.info(f"Kernel iteration range: {kernel_iteration_range}") # Profile using backend (filtering at rocprofv3 level) logger.debug(f"Calling backend.profile with {len(metrics_to_compute)} metrics") @@ -169,6 +175,7 @@ def profile( kernel_filter=rocprof_filter, cwd=cwd, timeout_seconds=timeout_seconds, + kernel_iteration_range=kernel_iteration_range, ) logger.debug("Backend.profile completed") diff --git a/metrix/src/metrix/backends/base.py b/metrix/src/metrix/backends/base.py index 65c8cc5..9d5fa96 100644 --- a/metrix/src/metrix/backends/base.py +++ b/metrix/src/metrix/backends/base.py @@ -472,7 +472,7 @@ def profile( kernel_filter: Optional[str] = None, cwd: Optional[str] = None, timeout_seconds: Optional[int] = 0, - use_kernel_iteration_range: bool = False, # Disabled: rocprofv3 hangs with multiple counter blocks + kernel_iteration_range: Optional[str] = None, ): """ Profile command with two-level aggregation and multi-pass support @@ -490,16 +490,25 @@ def profile( in profiling results. cwd: Working directory for command execution timeout_seconds: Timeout in seconds for profiling (default: 0, None for no timeout) + kernel_iteration_range: Optional YAML ``jobs[].kernel_iteration_range`` string passed + to rocprofv3 ``--input`` (e.g. ``"[5,5]"`` for the 5th launch of each matched kernel). + When set, Metrix runs ``num_replays`` profiling passes, each applying this range. Returns: self (for chaining) """ from ..logger import logger + # rocprofv3 jobs[].kernel_iteration_range — optional per-kernel launch index window. 
+ user_iteration_range: Optional[str] = ( + kernel_iteration_range.strip() if kernel_iteration_range else None + ) + effective_iteration_range: Optional[str] = user_iteration_range + # Get counters needed counters = self.get_required_counters(metrics) - # Split metrics into category-based batches to avoid rocprofv3 hangs + # Split large metric sets into category-based batches (fewer counters per rocprof pass) # Group by category (memory.*, proprietary.*, etc.) for better organization MAX_METRICS_PER_BATCH = 6 if len(metrics) > MAX_METRICS_PER_BATCH: @@ -550,7 +559,7 @@ def profile( kernel_filter=kernel_filter, cwd=cwd, timeout_seconds=timeout_seconds, - use_kernel_iteration_range=use_kernel_iteration_range, + kernel_iteration_range=kernel_iteration_range, ) # Merge batch results @@ -628,28 +637,29 @@ def profile( pass_results = [] - # Use kernel_iteration_range for faster profiling - if use_kernel_iteration_range: - iteration_range = f"[1,{num_replays}]" + if effective_iteration_range is not None: logger.info( - f" Using kernel_iteration_range={iteration_range} (rocprofv3 internal iterations)" - ) - results = self._run_rocprof( - command, - pass_counters, - kernel_filter, - cwd=cwd, - timeout_seconds=timeout_seconds, - kernel_iteration_range=iteration_range, + f" Using kernel_iteration_range={effective_iteration_range} " + f"across {num_replays} replay(s)" ) - # Tag all results with replay_id 0 since rocprofv3 handles iterations - for r in results: - r.run_id = 0 - pass_results.extend(results) + for replay_id in range(num_replays): + if num_replays >= 20 and ( + replay_id == 0 or (replay_id + 1) % 10 == 0 or replay_id == num_replays - 1 + ): + logger.info(f" Replay {replay_id + 1}/{num_replays}...") + results = self._run_rocprof( + command, + pass_counters, + kernel_filter, + cwd=cwd, + timeout_seconds=timeout_seconds, + kernel_iteration_range=effective_iteration_range, + ) + for r in results: + r.run_id = replay_id + pass_results.extend(results) else: - # Legacy 
mode: run application multiple times for replay_id in range(num_replays): - # Show progress every 10 replays or at key milestones if num_replays >= 20 and ( replay_id == 0 or (replay_id + 1) % 10 == 0 or replay_id == num_replays - 1 ): @@ -662,7 +672,6 @@ def profile( cwd=cwd, timeout_seconds=timeout_seconds, ) - # Tag with replay_id for debugging for r in results: r.run_id = replay_id pass_results.extend(results) @@ -790,6 +799,7 @@ def _run_rocprof( kernel_filter: Optional regular expression to filter kernels by name cwd: Optional working directory for command execution timeout_seconds: Timeout in seconds for profiling (default: 0, zero or None for no timeout) + kernel_iteration_range: Optional rocprofv3 job field (e.g. ``"[2,4]"``) Returns: List of ProfileResult objects diff --git a/metrix/src/metrix/backends/gfx90a.py b/metrix/src/metrix/backends/gfx90a.py index bc38992..7b5d40a 100644 --- a/metrix/src/metrix/backends/gfx90a.py +++ b/metrix/src/metrix/backends/gfx90a.py @@ -99,10 +99,17 @@ def _run_rocprof( kernel_filter: Optional[str] = None, cwd: Optional[str] = None, timeout_seconds: Optional[int] = 0, + kernel_iteration_range: Optional[str] = None, ) -> List[ProfileResult]: """Run rocprofv3 and return results (single pass only - base class handles multi-pass)""" wrapper = ROCProfV3Wrapper(timeout_seconds=timeout_seconds) - return wrapper.profile(command, counters, kernel_filter=kernel_filter, cwd=cwd) + return wrapper.profile( + command, + counters, + kernel_filter=kernel_filter, + cwd=cwd, + kernel_iteration_range=kernel_iteration_range, + ) # Memory bandwidth metrics diff --git a/metrix/src/metrix/backends/gfx942.py b/metrix/src/metrix/backends/gfx942.py index 3391012..dc29632 100644 --- a/metrix/src/metrix/backends/gfx942.py +++ b/metrix/src/metrix/backends/gfx942.py @@ -99,10 +99,17 @@ def _run_rocprof( kernel_filter: Optional[str] = None, cwd: Optional[str] = None, timeout_seconds: Optional[int] = 0, + kernel_iteration_range: Optional[str] = None, ) -> 
List[ProfileResult]: """Run rocprofv3 and return results (single pass only - base class handles multi-pass)""" wrapper = ROCProfV3Wrapper(timeout_seconds=timeout_seconds) - return wrapper.profile(command, counters, kernel_filter=kernel_filter, cwd=cwd) + return wrapper.profile( + command, + counters, + kernel_filter=kernel_filter, + cwd=cwd, + kernel_iteration_range=kernel_iteration_range, + ) # Memory bandwidth metrics diff --git a/metrix/src/metrix/cli/main.py b/metrix/src/metrix/cli/main.py index ae99015..197c5d1 100644 --- a/metrix/src/metrix/cli/main.py +++ b/metrix/src/metrix/cli/main.py @@ -14,6 +14,13 @@ from .info_cmd import info_command +def _positive_int(value: str) -> int: + n = int(value) + if n < 1: + raise argparse.ArgumentTypeError("must be >= 1") + return n + + def create_parser(): """Create argument parser""" @@ -32,6 +39,9 @@ def create_parser(): # Filter specific kernels (regex) metrix profile --profile memory --kernel "matmul.*" ./my_app + # Only the 10th launch of a repeatedly-dispatched kernel (use with --kernel) + metrix profile --kernel "^my_gemm" --kernel-iteration 10 -n 3 ./bench + # List available metrics metrix list metrics --category memory @@ -130,6 +140,28 @@ def create_parser(): help="Aggregate metrics by kernel name across replays (default: per-dispatch across runs)", ) + _kernel_iter = profile_parser.add_mutually_exclusive_group() + _kernel_iter.add_argument( + "--kernel-iteration", + type=_positive_int, + metavar="N", + help=( + "Collect hardware counters only for the Nth launch of each kernel matching " + "--kernel (YAML jobs[].kernel_iteration_range [N,N]). Use when the app runs the " + "same kernel multiple times (warmups, loops) and you want one launch. Combine " + "with -n to average across multiple full-app replays." + ), + ) + _kernel_iter.add_argument( + "--kernel-iteration-range", + type=str, + metavar="RANGE", + help=( + "YAML jobs[].kernel_iteration_range passed via rocprofv3 --input, e.g. " + '"[1,3]" or "[10,10]". 
See ROCprofiler-SDK input schema.' + ), + ) + # List command list_parser = subparsers.add_parser( "list", diff --git a/metrix/src/metrix/cli/profile_cmd.py b/metrix/src/metrix/cli/profile_cmd.py index 9ddd17a..2b36673 100644 --- a/metrix/src/metrix/cli/profile_cmd.py +++ b/metrix/src/metrix/cli/profile_cmd.py @@ -84,10 +84,26 @@ def profile_command(args): logger.info(f"Replays: {args.num_replays}") if args.kernel: logger.info(f"Filter: {args.kernel}") + kernel_iteration = getattr(args, "kernel_iteration", None) + kernel_iteration_range = getattr(args, "kernel_iteration_range", None) + resolved_iteration_range = None + if kernel_iteration is not None: + resolved_iteration_range = f"[{kernel_iteration},{kernel_iteration}]" + elif kernel_iteration_range: + resolved_iteration_range = kernel_iteration_range.strip() + if resolved_iteration_range: + logger.info( + f"Kernel iteration range (jobs[].kernel_iteration_range): {resolved_iteration_range}" + ) logger.info(f"{'=' * 80}") # Build kernel filter (regular expression, passed through to profiler) kernel_filter = args.kernel if args.kernel else None + if resolved_iteration_range and not kernel_filter: + logger.warning( + "Without --kernel, rocprofv3 counts launches separately per kernel name; " + "use --kernel to target one kernel when using iteration ranges." + ) # Profile using backend (handles multi-replay & aggregation internally!) 
try: @@ -100,6 +116,8 @@ def profile_command(args): num_replays=args.num_replays, aggregate_by_kernel=args.aggregate, kernel_filter=kernel_filter, + timeout_seconds=getattr(args, "timeout", 0), + kernel_iteration_range=resolved_iteration_range, ) except Exception as e: logger.error(f"Profiling failed: {e}") diff --git a/metrix/src/metrix/mcp/server.py b/metrix/src/metrix/mcp/server.py index 2d62c89..f68c8a2 100644 --- a/metrix/src/metrix/mcp/server.py +++ b/metrix/src/metrix/mcp/server.py @@ -4,6 +4,8 @@ """MCP Server for Metrix - Human-Readable GPU Metrics.""" +from typing import List, Optional + from mcp.server.fastmcp import FastMCP from metrix import Metrix @@ -12,7 +14,13 @@ @mcp.tool() -def profile_metrics(command: str, metrics: list[str] = None) -> dict: +def profile_metrics( + command: str, + metrics: Optional[List[str]] = None, + kernel_filter: Optional[str] = None, + kernel_iteration_range: Optional[str] = None, + num_replays: int = 1, +) -> dict: """ Profile GPU application and collect hardware performance metrics. 
@@ -23,6 +31,9 @@ def profile_metrics(command: str, metrics: list[str] = None) -> dict: Args: command: Command to profile (e.g., './app') metrics: List of metrics to collect (default: common metrics) + kernel_filter: Optional regex; only matching kernels are profiled + kernel_iteration_range: Optional jobs[].kernel_iteration_range string (rocprofv3 --input YAML) + num_replays: Number of full-app profiling passes (-n) Returns: Dictionary with kernels list containing metrics and durations @@ -33,7 +44,13 @@ def profile_metrics(command: str, metrics: list[str] = None) -> dict: if metrics is None: metrics = ["memory.hbm_bandwidth_utilization"] - results_obj = profiler.profile(command, metrics=metrics) + results_obj = profiler.profile( + command, + metrics=metrics, + kernel_filter=kernel_filter, + kernel_iteration_range=kernel_iteration_range, + num_replays=num_replays, + ) results = {"kernels": []} diff --git a/metrix/src/metrix/profiler/rocprof_wrapper.py b/metrix/src/metrix/profiler/rocprof_wrapper.py index 13f5315..2feac18 100644 --- a/metrix/src/metrix/profiler/rocprof_wrapper.py +++ b/metrix/src/metrix/profiler/rocprof_wrapper.py @@ -98,7 +98,8 @@ def profile( ``".*attention.*"`` - kernels whose names contain "attention" ``"gemm|attention"`` - kernels matching either pattern cwd: Optional working directory - kernel_iteration_range: Optional iteration range (e.g., "[1,5]" to profile iterations 1-5) + kernel_iteration_range: Optional rocprofv3 job field (e.g. ``"[1,5]"`` or ``"[3,3]"`` + for only the 3rd launch of each kernel matching ``kernel_filter``) extra_counters_path: Path to YAML with custom counter definitions (rocprofiler-sdk: section) arch: GPU architecture (e.g., "gfx1201") to filter counter definitions @@ -331,6 +332,7 @@ def _create_input_yaml( "truncate_kernels": True, } + # ROCprofiler-SDK / rocprofv3 --input YAML: jobs[].kernel_iteration_range (e.g. 
"[1,5]") if kernel_iteration_range: job["kernel_iteration_range"] = kernel_iteration_range diff --git a/metrix/tests/unit/test_api.py b/metrix/tests/unit/test_api.py index 54fbdbc..58b3341 100644 --- a/metrix/tests/unit/test_api.py +++ b/metrix/tests/unit/test_api.py @@ -2,6 +2,8 @@ Unit tests for the high-level Metrix API """ +from unittest.mock import patch + import pytest from metrix.api import Metrix, ProfilingResults, KernelResults from metrix.backends import Statistics @@ -11,10 +13,9 @@ class TestMetrixInit: """Test Metrix initialization""" def test_init_default(self): - """Test default initialization (falls back to gfx942 if no hardware detected)""" + """Test default initialization (architecture from hardware detection)""" profiler = Metrix() - # Default depends on hardware detection, but should succeed - assert profiler.arch in ["gfx942", "gfx90a", "gfx1201"] + assert profiler.arch assert profiler.backend is not None @pytest.mark.parametrize("arch", ["gfx942", "gfx90a"]) @@ -165,3 +166,27 @@ def test_profile_filters_unsupported_in_profile(self): assert "memory.atomic_latency" not in filtered assert "memory.l2_hit_rate" in filtered assert "memory.hbm_bandwidth_utilization" in filtered + + +class TestMetrixProfilePlumbing: + """Metrix.profile forwards options to the backend without running rocprof.""" + + def test_profile_forwards_kernel_iteration_range_and_replays(self): + profiler = Metrix(arch="gfx942") + with patch.object(profiler.backend, "profile") as mock_profile: + mock_profile.return_value = None + out = profiler.profile( + "./fake_app", + metrics=["memory.l2_hit_rate"], + kernel_iteration_range="[3,3]", + num_replays=2, + kernel_filter=r"^my_kernel", + ) + mock_profile.assert_called_once() + kwargs = mock_profile.call_args.kwargs + assert kwargs["command"] == "./fake_app" + assert kwargs["metrics"] == ["memory.l2_hit_rate"] + assert kwargs["kernel_iteration_range"] == "[3,3]" + assert kwargs["num_replays"] == 2 + assert kwargs["kernel_filter"] == 
r"^my_kernel" + assert out.total_kernels == 0 diff --git a/metrix/tests/unit/test_cli_parser.py b/metrix/tests/unit/test_cli_parser.py new file mode 100644 index 0000000..9c9e48e --- /dev/null +++ b/metrix/tests/unit/test_cli_parser.py @@ -0,0 +1,39 @@ +"""Unit tests for Metrix CLI argument parsing.""" + +import pytest + +from metrix.cli.main import create_parser + + +class TestKernelIterationCLI: + def test_kernel_iteration_and_kernel(self): + parser = create_parser() + args = parser.parse_args( + ["profile", "--kernel-iteration", "5", "--kernel", "foo", "./my_app"] + ) + assert args.command == "profile" + assert args.kernel_iteration == 5 + assert args.kernel == "foo" + assert args.target == "./my_app" + + def test_kernel_iteration_range(self): + parser = create_parser() + args = parser.parse_args( + ["profile", "--kernel-iteration-range", "[10,10]", "-k", "bar.*", "./app"] + ) + assert args.kernel_iteration_range == "[10,10]" + assert args.kernel == "bar.*" + + def test_kernel_iteration_mutually_exclusive_with_range(self): + parser = create_parser() + with pytest.raises(SystemExit): + parser.parse_args( + [ + "profile", + "--kernel-iteration", + "1", + "--kernel-iteration-range", + "[2,2]", + "./app", + ] + ) diff --git a/metrix/tests/unit/test_rocprof_wrapper.py b/metrix/tests/unit/test_rocprof_wrapper.py index 205c7c4..3a17569 100644 --- a/metrix/tests/unit/test_rocprof_wrapper.py +++ b/metrix/tests/unit/test_rocprof_wrapper.py @@ -59,6 +59,21 @@ def test_create_input_yaml(self, wrapper): assert "TCC_MISS_sum" in content assert "SQ_WAVES" in content + def test_create_input_yaml_kernel_iteration_range(self, wrapper): + """kernel_iteration_range is written into the rocprofv3 jobs section.""" + with tempfile.TemporaryDirectory() as tmpdir: + tmppath = Path(tmpdir) + counters = ["TCC_HIT_sum"] + input_file = wrapper._create_input_yaml( + counters, + tmppath, + kernel_filter="^my_kernel", + kernel_iteration_range="[10,10]", + ) + text = input_file.read_text() + 
assert "kernel_iteration_range" in text + assert "[10,10]" in text.replace(" ", "") + def test_parse_csv_row(self, wrapper): """CSV row parsing works correctly""" # Mock CSV row