diff --git a/metrix/README.md b/metrix/README.md
index f6e8351..5131827 100644
--- a/metrix/README.md
+++ b/metrix/README.md
@@ -20,6 +20,7 @@ Existing GPU profilers have challenges:
 - **5 Compute Metrics**: FLOPS, arithmetic intensity (HBM/L2/L1), compute throughput
 - **Multi-Run Profiling**: Automatic aggregation with min/max/avg statistics
 - **Kernel Filtering**: Efficient regex filtering at rocprofv3 level
+- **Launch Selection**: Optional rocprofv3 `kernel_iteration_range` (CLI `--kernel-iteration` / `--kernel-iteration-range`) to profile only the Nth launch of a kernel that runs in a loop or after warmups
 - **Multiple Output Formats**: Text, JSON, CSV
 
 ## Installation
@@ -110,6 +111,10 @@ metrix profile [options] <target>
   --time-only        Only collect timing, no hardware counters
   --kernel, -k       Filter kernels by name (regular expression, passed to rocprofv3)
   --num-replays, -n  Replay the application N times and aggregate (default: 10)
+  --kernel-iteration N
+                     Collect counters only for the Nth (1-based) launch of each matched kernel (range [N,N])
+  --kernel-iteration-range RANGE
+                     Launch-index range such as [1,3]; written to jobs[].kernel_iteration_range in the rocprofv3 --input YAML
   --aggregate        Aggregate metrics by kernel name across replays (default: per-dispatch across runs)
   --top K            Show only top K slowest kernels
   --output, -o       Output file (.json, .csv, .txt)
@@ -128,10 +133,20 @@ metrix info <metric|profile> <name>
 
 Note: GPU architecture is auto-detected using `rocminfo`.
 
+**rocprofv3 iteration field:** Metrix passes counters via `rocprofv3 --input <file>`. The launch-index
+window is the YAML key **`kernel_iteration_range`** on each object under top-level **`jobs`**
+(ROCprofiler-SDK input schema), not a separate `rocprofv3` argv flag.
+
+**Dispatch index and `--kernel`:** Launch indices are **per kernel name** (each matched kernel has its
+own 1-based counter). Use **`--kernel`** / **`-k`** whenever you use **`--kernel-iteration`** or
+**`--kernel-iteration-range`** so the Nth launch refers to the kernel you care about; without a
+narrow filter, results include every kernel that matches the default regex.
+
 ## Testing
 
 ```bash
-python3 -m pytest tests/ -v
+python3 -m pytest tests/unit/ -q   # fast, no GPU
+python3 -m pytest tests/ -v        # includes integration (GPU / binaries where applicable)
 ```
 
 ## Requirements
diff --git a/metrix/skill/SKILL.md b/metrix/skill/SKILL.md
index 969ef6e..8f707de 100644
--- a/metrix/skill/SKILL.md
+++ b/metrix/skill/SKILL.md
@@ -42,7 +42,9 @@ metrix --metrics memory.l2_hit_rate,memory.coalescing_efficiency,compute.total_f
 metrix -o results.json ./my_app
 ```
 
-Options: `--profile`/`-p` (run `metrix list profiles` for names: `quick`, `memory`, `memory_bandwidth`, `memory_cache`, `compute`), `--metrics`/`-m`, `--time-only`, `--kernel`/`-k` (regular expression), `--num-replays`/`-n`, `--output`/`-o`, `--top`, `--aggregate`, `--timeout`, `--no-counters`, `--log`/`-l`, `--quiet`/`-q`. Discovery: `metrix list <metrics|profiles|devices>`, `metrix info <metric|profile> <name>`. Note: `metrix list counters` and `metrix info counter <name>` are not implemented yet (CLI reports “not yet implemented”).
+Options: `--profile`/`-p` (run `metrix list profiles` for names: `quick`, `memory`, `memory_bandwidth`, `memory_cache`, `compute`), `--metrics`/`-m`, `--time-only`, `--kernel`/`-k` (regular expression), `--num-replays`/`-n`, `--kernel-iteration` / `--kernel-iteration-range` (YAML `jobs[].kernel_iteration_range`), `--output`/`-o`, `--top`, `--aggregate`, `--timeout`, `--no-counters`, `--log`/`-l`, `--quiet`/`-q`. Discovery: `metrix list <metrics|profiles|devices>`, `metrix info <metric|profile> <name>`. Note: `metrix list counters` and `metrix info counter <name>` are not implemented yet (CLI reports “not yet implemented”).
+
+With `--kernel-iteration` or `--kernel-iteration-range`, always set `--kernel` as well: launch indices are counted **per kernel name**, so a narrow regex keeps “Nth launch” meaningful.
 
 ### Python API
 
@@ -51,6 +53,7 @@ from metrix import Metrix
 
 profiler = Metrix()
 results = profiler.profile("./my_app", num_replays=5)
+# Optional: kernel_iteration_range="[5,5]", kernel_filter="^my_kernel" (indices are per kernel name)
 
 for kernel in results.kernels:
     print(kernel.name, kernel.duration_us.avg)
diff --git a/metrix/src/metrix/api.py b/metrix/src/metrix/api.py
index 02738f2..fb0774a 100644
--- a/metrix/src/metrix/api.py
+++ b/metrix/src/metrix/api.py
@@ -87,6 +87,7 @@ def profile(
         aggregate_by_kernel: bool = True,
         cwd: Optional[str] = None,
         timeout_seconds: Optional[int] = 0,
+        kernel_iteration_range: Optional[str] = None,
     ) -> ProfilingResults:
         """
         Profile a command
@@ -109,6 +110,9 @@ def profile(
             aggregate_by_kernel: Aggregate dispatches by kernel name (default: True)
             cwd: Working directory for command execution (default: None)
             timeout_seconds: Timeout in seconds for profiling (default: 0, zero or None for no timeout)
+            kernel_iteration_range: Optional YAML ``jobs[].kernel_iteration_range`` string
+                (rocprofv3 ``--input``). E.g. ``"[5,5]"`` for only the 5th launch of each matched
+                kernel. Metrix runs ``num_replays`` profiling passes, each applying this range.
 
         Returns:
             ProfilingResults object with all collected data
@@ -158,6 +162,8 @@ def profile(
         logger.info(f"Collecting {len(metrics_to_compute)} metrics across {num_replays} replay(s)")
         if rocprof_filter:
             logger.info(f"Kernel filter: {rocprof_filter}")
+        if kernel_iteration_range:
+            logger.info(f"Kernel iteration range: {kernel_iteration_range}")
 
         # Profile using backend (filtering at rocprofv3 level)
         logger.debug(f"Calling backend.profile with {len(metrics_to_compute)} metrics")
@@ -169,6 +175,7 @@ def profile(
             kernel_filter=rocprof_filter,
             cwd=cwd,
             timeout_seconds=timeout_seconds,
+            kernel_iteration_range=kernel_iteration_range,
         )
         logger.debug("Backend.profile completed")
 
diff --git a/metrix/src/metrix/backends/base.py b/metrix/src/metrix/backends/base.py
index 65c8cc5..9d5fa96 100644
--- a/metrix/src/metrix/backends/base.py
+++ b/metrix/src/metrix/backends/base.py
@@ -472,7 +472,7 @@ def profile(
         kernel_filter: Optional[str] = None,
         cwd: Optional[str] = None,
         timeout_seconds: Optional[int] = 0,
-        use_kernel_iteration_range: bool = False,  # Disabled: rocprofv3 hangs with multiple counter blocks
+        kernel_iteration_range: Optional[str] = None,
     ):
         """
         Profile command with two-level aggregation and multi-pass support
@@ -490,16 +490,25 @@ def profile(
                 in profiling results.
             cwd: Working directory for command execution
             timeout_seconds: Timeout in seconds for profiling (default: 0, None for no timeout)
+            kernel_iteration_range: Optional YAML ``jobs[].kernel_iteration_range`` string passed
+                to rocprofv3 ``--input`` (e.g. ``"[5,5]"`` for the 5th launch of each matched kernel).
+                When set, Metrix runs ``num_replays`` profiling passes, each applying this range.
 
         Returns:
             self (for chaining)
         """
         from ..logger import logger
 
+        # rocprofv3 jobs[].kernel_iteration_range — optional per-kernel launch index window.
+        # Blank or whitespace-only input normalizes to None, so the replay loop below never
+        # logs or forwards an empty range.
+        user_iteration_range: Optional[str] = (kernel_iteration_range or "").strip() or None
+        effective_iteration_range: Optional[str] = user_iteration_range
+
         # Get counters needed
         counters = self.get_required_counters(metrics)
 
-        # Split metrics into category-based batches to avoid rocprofv3 hangs
+        # Split large metric sets into category-based batches (fewer counters per rocprof pass)
         # Group by category (memory.*, proprietary.*, etc.) for better organization
         MAX_METRICS_PER_BATCH = 6
         if len(metrics) > MAX_METRICS_PER_BATCH:
@@ -550,7 +559,7 @@ def profile(
                     kernel_filter=kernel_filter,
                     cwd=cwd,
                     timeout_seconds=timeout_seconds,
-                    use_kernel_iteration_range=use_kernel_iteration_range,
+                    kernel_iteration_range=kernel_iteration_range,
                 )
 
                 # Merge batch results
@@ -628,28 +637,29 @@ def profile(
 
             pass_results = []
 
-            # Use kernel_iteration_range for faster profiling
-            if use_kernel_iteration_range:
-                iteration_range = f"[1,{num_replays}]"
+            if effective_iteration_range is not None:
                 logger.info(
-                    f"  Using kernel_iteration_range={iteration_range} (rocprofv3 internal iterations)"
-                )
-                results = self._run_rocprof(
-                    command,
-                    pass_counters,
-                    kernel_filter,
-                    cwd=cwd,
-                    timeout_seconds=timeout_seconds,
-                    kernel_iteration_range=iteration_range,
+                    f"  Using kernel_iteration_range={effective_iteration_range} "
+                    f"across {num_replays} replay(s)"
                 )
-                # Tag all results with replay_id 0 since rocprofv3 handles iterations
-                for r in results:
-                    r.run_id = 0
-                pass_results.extend(results)
+                for replay_id in range(num_replays):
+                    if num_replays >= 20 and (
+                        replay_id == 0 or (replay_id + 1) % 10 == 0 or replay_id == num_replays - 1
+                    ):
+                        logger.info(f"  Replay {replay_id + 1}/{num_replays}...")
+                    results = self._run_rocprof(
+                        command,
+                        pass_counters,
+                        kernel_filter,
+                        cwd=cwd,
+                        timeout_seconds=timeout_seconds,
+                        kernel_iteration_range=effective_iteration_range,
+                    )
+                    for r in results:
+                        r.run_id = replay_id
+                    pass_results.extend(results)
             else:
-                # Legacy mode: run application multiple times
                 for replay_id in range(num_replays):
-                    # Show progress every 10 replays or at key milestones
                     if num_replays >= 20 and (
                         replay_id == 0 or (replay_id + 1) % 10 == 0 or replay_id == num_replays - 1
                     ):
@@ -662,7 +672,6 @@ def profile(
                         cwd=cwd,
                         timeout_seconds=timeout_seconds,
                     )
-                    # Tag with replay_id for debugging
                     for r in results:
                         r.run_id = replay_id
                     pass_results.extend(results)
@@ -790,6 +799,7 @@ def _run_rocprof(
             kernel_filter: Optional regular expression to filter kernels by name
             cwd: Optional working directory for command execution
             timeout_seconds: Timeout in seconds for profiling (default: 0, zero or None for no timeout)
+            kernel_iteration_range: Optional rocprofv3 job field (e.g. ``"[2,4]"``)
 
         Returns:
             List of ProfileResult objects
diff --git a/metrix/src/metrix/backends/gfx90a.py b/metrix/src/metrix/backends/gfx90a.py
index bc38992..7b5d40a 100644
--- a/metrix/src/metrix/backends/gfx90a.py
+++ b/metrix/src/metrix/backends/gfx90a.py
@@ -99,10 +99,17 @@ def _run_rocprof(
         kernel_filter: Optional[str] = None,
         cwd: Optional[str] = None,
         timeout_seconds: Optional[int] = 0,
+        kernel_iteration_range: Optional[str] = None,
     ) -> List[ProfileResult]:
         """Run rocprofv3 and return results (single pass only - base class handles multi-pass)"""
         wrapper = ROCProfV3Wrapper(timeout_seconds=timeout_seconds)
-        return wrapper.profile(command, counters, kernel_filter=kernel_filter, cwd=cwd)
+        return wrapper.profile(
+            command,
+            counters,
+            kernel_filter=kernel_filter,
+            cwd=cwd,
+            kernel_iteration_range=kernel_iteration_range,
+        )
 
     # Memory bandwidth metrics
 
diff --git a/metrix/src/metrix/backends/gfx942.py b/metrix/src/metrix/backends/gfx942.py
index 3391012..dc29632 100644
--- a/metrix/src/metrix/backends/gfx942.py
+++ b/metrix/src/metrix/backends/gfx942.py
@@ -99,10 +99,17 @@ def _run_rocprof(
         kernel_filter: Optional[str] = None,
         cwd: Optional[str] = None,
         timeout_seconds: Optional[int] = 0,
+        kernel_iteration_range: Optional[str] = None,
     ) -> List[ProfileResult]:
         """Run rocprofv3 and return results (single pass only - base class handles multi-pass)"""
         wrapper = ROCProfV3Wrapper(timeout_seconds=timeout_seconds)
-        return wrapper.profile(command, counters, kernel_filter=kernel_filter, cwd=cwd)
+        return wrapper.profile(
+            command,
+            counters,
+            kernel_filter=kernel_filter,
+            cwd=cwd,
+            kernel_iteration_range=kernel_iteration_range,
+        )
 
     # Memory bandwidth metrics
 
diff --git a/metrix/src/metrix/cli/main.py b/metrix/src/metrix/cli/main.py
index ae99015..197c5d1 100644
--- a/metrix/src/metrix/cli/main.py
+++ b/metrix/src/metrix/cli/main.py
@@ -14,6 +14,13 @@
 from .info_cmd import info_command
 
 
+def _positive_int(value: str) -> int:
+    count = int(value)  # a ValueError raised here is reported by argparse as a usage error
+    if count >= 1:
+        return count
+    raise argparse.ArgumentTypeError("must be >= 1")
+
+
 def create_parser():
     """Create argument parser"""
 
@@ -32,6 +39,9 @@ def create_parser():
   # Filter specific kernels (regex)
   metrix profile --profile memory --kernel "matmul.*" ./my_app
 
+  # Only the 10th launch of a repeatedly-dispatched kernel (use with --kernel)
+  metrix profile --kernel "^my_gemm" --kernel-iteration 10 -n 3 ./bench
+
   # List available metrics
   metrix list metrics --category memory
 
@@ -130,6 +140,28 @@ def create_parser():
         help="Aggregate metrics by kernel name across replays (default: per-dispatch across runs)",
     )
 
+    _kernel_iter = profile_parser.add_mutually_exclusive_group()
+    _kernel_iter.add_argument(
+        "--kernel-iteration",
+        type=_positive_int,
+        metavar="N",
+        help=(
+            "Collect hardware counters only for the Nth launch of each kernel matching "
+            "--kernel (YAML jobs[].kernel_iteration_range [N,N]). Use when the app runs the "
+            "same kernel multiple times (warmups, loops) and you want one launch. Combine "
+            "with -n to average across multiple full-app replays."
+        ),
+    )
+    _kernel_iter.add_argument(
+        "--kernel-iteration-range",
+        type=str,
+        metavar="RANGE",
+        help=(
+            "YAML jobs[].kernel_iteration_range passed via rocprofv3 --input, e.g. "
+            '"[1,3]" or "[10,10]". See ROCprofiler-SDK input schema.'
+        ),
+    )
+
     # List command
     list_parser = subparsers.add_parser(
         "list",
diff --git a/metrix/src/metrix/cli/profile_cmd.py b/metrix/src/metrix/cli/profile_cmd.py
index 9ddd17a..2b36673 100644
--- a/metrix/src/metrix/cli/profile_cmd.py
+++ b/metrix/src/metrix/cli/profile_cmd.py
@@ -84,10 +84,26 @@ def profile_command(args):
         logger.info(f"Replays: {args.num_replays}")
     if args.kernel:
         logger.info(f"Filter: {args.kernel}")
+    kernel_iteration = getattr(args, "kernel_iteration", None)
+    kernel_iteration_range = getattr(args, "kernel_iteration_range", None)
+    resolved_iteration_range = None
+    if kernel_iteration is not None:
+        resolved_iteration_range = f"[{kernel_iteration},{kernel_iteration}]"
+    elif kernel_iteration_range:
+        resolved_iteration_range = kernel_iteration_range.strip()
+    if resolved_iteration_range:
+        logger.info(
+            f"Kernel iteration range (jobs[].kernel_iteration_range): {resolved_iteration_range}"
+        )
     logger.info(f"{'=' * 80}")
 
     # Build kernel filter (regular expression, passed through to profiler)
     kernel_filter = args.kernel if args.kernel else None
+    if resolved_iteration_range and not kernel_filter:
+        logger.warning(
+            "Without --kernel, rocprofv3 counts launches separately per kernel name; "
+            "use --kernel to target one kernel when using iteration ranges."
+        )
 
     # Profile using backend (handles multi-replay & aggregation internally!)
     try:
@@ -100,6 +116,8 @@ def profile_command(args):
             num_replays=args.num_replays,
             aggregate_by_kernel=args.aggregate,
             kernel_filter=kernel_filter,
+            timeout_seconds=getattr(args, "timeout", 0),
+            kernel_iteration_range=resolved_iteration_range,
         )
     except Exception as e:
         logger.error(f"Profiling failed: {e}")
diff --git a/metrix/src/metrix/mcp/server.py b/metrix/src/metrix/mcp/server.py
index 2d62c89..f68c8a2 100644
--- a/metrix/src/metrix/mcp/server.py
+++ b/metrix/src/metrix/mcp/server.py
@@ -4,6 +4,8 @@
 
 """MCP Server for Metrix - Human-Readable GPU Metrics."""
 
+from typing import List, Optional
+
 from mcp.server.fastmcp import FastMCP
 
 from metrix import Metrix
@@ -12,7 +14,13 @@
 
 
 @mcp.tool()
-def profile_metrics(command: str, metrics: list[str] = None) -> dict:
+def profile_metrics(
+    command: str,
+    metrics: Optional[List[str]] = None,
+    kernel_filter: Optional[str] = None,
+    kernel_iteration_range: Optional[str] = None,
+    num_replays: int = 1,
+) -> dict:
     """
     Profile GPU application and collect hardware performance metrics.
 
@@ -23,6 +31,9 @@ def profile_metrics(command: str, metrics: list[str] = None) -> dict:
     Args:
         command: Command to profile (e.g., './app')
         metrics: List of metrics to collect (default: common metrics)
+        kernel_filter: Optional regex; only matching kernels are profiled
+        kernel_iteration_range: Optional jobs[].kernel_iteration_range string (rocprofv3 --input YAML)
+        num_replays: Number of full-app profiling passes (-n)
 
     Returns:
         Dictionary with kernels list containing metrics and durations
@@ -33,7 +44,13 @@ def profile_metrics(command: str, metrics: list[str] = None) -> dict:
     if metrics is None:
         metrics = ["memory.hbm_bandwidth_utilization"]
 
-    results_obj = profiler.profile(command, metrics=metrics)
+    results_obj = profiler.profile(
+        command,
+        metrics=metrics,
+        kernel_filter=kernel_filter,
+        kernel_iteration_range=kernel_iteration_range,
+        num_replays=num_replays,
+    )
 
     results = {"kernels": []}
 
diff --git a/metrix/src/metrix/profiler/rocprof_wrapper.py b/metrix/src/metrix/profiler/rocprof_wrapper.py
index 13f5315..2feac18 100644
--- a/metrix/src/metrix/profiler/rocprof_wrapper.py
+++ b/metrix/src/metrix/profiler/rocprof_wrapper.py
@@ -98,7 +98,8 @@ def profile(
                   ``".*attention.*"``   - kernels whose names contain "attention"
                   ``"gemm|attention"``  - kernels matching either pattern
             cwd: Optional working directory
-            kernel_iteration_range: Optional iteration range (e.g., "[1,5]" to profile iterations 1-5)
+            kernel_iteration_range: Optional rocprofv3 job field (e.g. ``"[1,5]"`` or ``"[3,3]"``
+                for only the 3rd launch of each kernel matching ``kernel_filter``)
             extra_counters_path: Path to YAML with custom counter definitions (rocprofiler-sdk: section)
             arch: GPU architecture (e.g., "gfx1201") to filter counter definitions
 
@@ -331,6 +332,7 @@ def _create_input_yaml(
             "truncate_kernels": True,
         }
 
+        # ROCprofiler-SDK / rocprofv3 --input YAML: jobs[].kernel_iteration_range (e.g. "[1,5]")
         if kernel_iteration_range:
             job["kernel_iteration_range"] = kernel_iteration_range
 
diff --git a/metrix/tests/unit/test_api.py b/metrix/tests/unit/test_api.py
index 54fbdbc..58b3341 100644
--- a/metrix/tests/unit/test_api.py
+++ b/metrix/tests/unit/test_api.py
@@ -2,6 +2,8 @@
 Unit tests for the high-level Metrix API
 """
 
+from unittest.mock import patch
+
 import pytest
 from metrix.api import Metrix, ProfilingResults, KernelResults
 from metrix.backends import Statistics
@@ -11,10 +13,9 @@ class TestMetrixInit:
     """Test Metrix initialization"""
 
     def test_init_default(self):
-        """Test default initialization (falls back to gfx942 if no hardware detected)"""
+        """Test default initialization (architecture from hardware detection)"""
         profiler = Metrix()
-        # Default depends on hardware detection, but should succeed
-        assert profiler.arch in ["gfx942", "gfx90a", "gfx1201"]
+        assert profiler.arch
         assert profiler.backend is not None
 
     @pytest.mark.parametrize("arch", ["gfx942", "gfx90a"])
@@ -165,3 +166,27 @@ def test_profile_filters_unsupported_in_profile(self):
         assert "memory.atomic_latency" not in filtered
         assert "memory.l2_hit_rate" in filtered
         assert "memory.hbm_bandwidth_utilization" in filtered
+
+
+class TestMetrixProfilePlumbing:
+    """Check that Metrix.profile hands its options to the backend (rocprof never runs)."""
+
+    def test_profile_forwards_kernel_iteration_range_and_replays(self):
+        # backend.profile is replaced by a mock, so only argument forwarding is exercised.
+        mx = Metrix(arch="gfx942")
+        with patch.object(mx.backend, "profile", return_value=None) as backend_profile:
+            results = mx.profile(
+                "./fake_app",
+                metrics=["memory.l2_hit_rate"],
+                kernel_iteration_range="[3,3]",
+                num_replays=2,
+                kernel_filter=r"^my_kernel",
+            )
+        backend_profile.assert_called_once()
+        forwarded = backend_profile.call_args.kwargs
+        assert forwarded["command"] == "./fake_app"
+        assert forwarded["metrics"] == ["memory.l2_hit_rate"]
+        assert forwarded["kernel_iteration_range"] == "[3,3]"
+        assert forwarded["num_replays"] == 2
+        assert forwarded["kernel_filter"] == r"^my_kernel"
+        assert results.total_kernels == 0
diff --git a/metrix/tests/unit/test_cli_parser.py b/metrix/tests/unit/test_cli_parser.py
new file mode 100644
index 0000000..9c9e48e
--- /dev/null
+++ b/metrix/tests/unit/test_cli_parser.py
@@ -0,0 +1,39 @@
+"""Unit tests for Metrix CLI argument parsing."""
+
+import pytest
+
+from metrix.cli.main import create_parser
+
+
+class TestKernelIterationCLI:
+    """Argument parsing for --kernel-iteration and --kernel-iteration-range."""
+
+    def test_kernel_iteration_and_kernel(self):
+        argv = ["profile", "--kernel-iteration", "5", "--kernel", "foo", "./my_app"]
+        args = create_parser().parse_args(argv)
+        assert args.command == "profile"
+        assert args.kernel_iteration == 5
+        assert args.kernel == "foo"
+        assert args.target == "./my_app"
+
+    def test_kernel_iteration_range(self):
+        argv = ["profile", "--kernel-iteration-range", "[10,10]", "-k", "bar.*", "./app"]
+        args = create_parser().parse_args(argv)
+        assert args.kernel_iteration_range == "[10,10]"
+        assert args.kernel == "bar.*"
+
+    def test_kernel_iteration_flags_default_to_none(self):
+        args = create_parser().parse_args(["profile", "./app"])
+        assert args.kernel_iteration is None
+        assert args.kernel_iteration_range is None
+
+    def test_kernel_iteration_mutually_exclusive_with_range(self):
+        argv = ["profile", "--kernel-iteration", "1"]
+        argv += ["--kernel-iteration-range", "[2,2]", "./app"]
+        with pytest.raises(SystemExit):
+            create_parser().parse_args(argv)
+
+    @pytest.mark.parametrize("bad", ["0", "-3", "abc"])
+    def test_kernel_iteration_rejects_invalid_values(self, bad):
+        with pytest.raises(SystemExit):
+            create_parser().parse_args(["profile", "--kernel-iteration", bad, "./app"])
diff --git a/metrix/tests/unit/test_rocprof_wrapper.py b/metrix/tests/unit/test_rocprof_wrapper.py
index 205c7c4..3a17569 100644
--- a/metrix/tests/unit/test_rocprof_wrapper.py
+++ b/metrix/tests/unit/test_rocprof_wrapper.py
@@ -59,6 +59,21 @@ def test_create_input_yaml(self, wrapper):
             assert "TCC_MISS_sum" in content
             assert "SQ_WAVES" in content
 
+    def test_create_input_yaml_kernel_iteration_range(self, wrapper):
+        """The generated rocprofv3 input YAML carries the requested kernel_iteration_range."""
+        with tempfile.TemporaryDirectory() as scratch:
+            yaml_path = wrapper._create_input_yaml(
+                ["TCC_HIT_sum"],
+                Path(scratch),
+                kernel_filter="^my_kernel",
+                kernel_iteration_range="[10,10]",
+            )
+            rendered = yaml_path.read_text()
+            # The key itself must appear in the generated file.
+            assert "kernel_iteration_range" in rendered
+            # Drop spaces before matching so the check does not depend on value spacing.
+            assert "[10,10]" in rendered.replace(" ", "")
+
     def test_parse_csv_row(self, wrapper):
         """CSV row parsing works correctly"""
         # Mock CSV row