diff --git a/.gitignore b/.gitignore index 253938e..29a3c4d 100644 --- a/.gitignore +++ b/.gitignore @@ -74,6 +74,8 @@ cover/ *.csv *.ncu-rep local_settings.py +*.local +*.local.* db.sqlite3 db.sqlite3-journal diff --git a/modeling/transformers/Dockerfile b/modeling/transformers/Dockerfile index 7a29928..5e766ec 100644 --- a/modeling/transformers/Dockerfile +++ b/modeling/transformers/Dockerfile @@ -21,8 +21,14 @@ ARG TORCH_VERSION=2.9.1 ENV PYTHONUNBUFFERED=1 -# Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ +# Install system dependencies and NSight Systems CLI. +RUN \ + apt-get update && \ + apt-get install -y --no-install-recommends gnupg && \ + echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2204/amd64/ /" | tee /etc/apt/sources.list.d/nvidia-devtools.list && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/devtools/repos/ubuntu2204/amd64/nvidia.pub && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ python3-pip \ python3-dev \ python-is-python3 \ @@ -30,7 +36,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ wget \ curl \ build-essential \ - && rm -rf /var/lib/apt/lists/* + nsight-systems-cli && \ + rm -rf /var/lib/apt/lists/* # Upgrade pip RUN python -m pip install --upgrade pip setuptools wheel diff --git a/modeling/transformers/README.md b/modeling/transformers/README.md index 24a479f..ef4594e 100644 --- a/modeling/transformers/README.md +++ b/modeling/transformers/README.md @@ -88,6 +88,38 @@ python infer.py \ --num_runs 5 ``` +### Kernel Coverage Report + +Report the fraction of GPU time and kernel launches covered by TileGym cuTile kernels. Runs the model under NSight Systems (`nsys profile`) and analyzes the trace automatically. + +```bash +python infer.py \ + --model_id meta-llama/Meta-Llama-3.1-8B \ + --use_tilegym \ + --use_cutile \ + --use_attn \ + --report_kernel_coverage \ + --sentence_file sample_inputs/input_prompt_32K.txt \ + --output_length 100 +``` + +Example output: +```text +===== NSYS KERNEL GPU TIME ANALYSIS ===== + +Kernel Name # Calls GPU Time (ms) % of Total +------------------------------------------------------------ -------- ------------- ---------- +fmha_kernel ... 54.507 10.5% +rms_norm_kernel_gather ... 9.788 1.9% +... +------------------------------------------------------------ -------- ------------- ---------- +TileGym Total 9676 95.147 18.3% +All Kernels Total 104858 520.725 100.0% + +>>> cuTile Kernel Coverage (GPU Time): 18.3% <<< +>>> cuTile Kernel Coverage (# Launches): 9.2% <<< +``` + ## Performance Benchmark Benchmark TileGym's CUTILE-optimized kernels against standard PyTorch implementation. The `--profile` flag enables detailed performance metrics including throughput (tokens/sec) and generation latency. @@ -241,6 +273,7 @@ python infer.py \ | `--num_runs` | Benchmark iterations | `5` | | `--warmup_runs` | Warmup iterations | `2` | | `--profile` | Enable profiling | `False` | +| `--report_kernel_coverage` | Report cuTile kernel GPU time and launch count coverage via nsys | `False` | | `--show_outputs` | Print generated text | `False` | diff --git a/modeling/transformers/bench_deepseek.sh b/modeling/transformers/bench_deepseek.sh index 36d1b1e..c86e38d 100755 --- a/modeling/transformers/bench_deepseek.sh +++ b/modeling/transformers/bench_deepseek.sh @@ -55,3 +55,17 @@ else echo "Summary file not found." fi echo "========================================" + +echo "" +echo "========================================" +echo " TileGym Kernel Coverage" +echo "========================================" +python infer.py \ + --model_id ${MODEL_ID} \ + --use_tilegym \ + --use_cutile \ + --use_attn \ + --report_kernel_coverage \ + --sentence_file ${INPUT_FILE} \ + --output_length ${OUTPUT_LENGTH} +echo "========================================" diff --git a/modeling/transformers/bench_gemma3.sh b/modeling/transformers/bench_gemma3.sh index da1ef2f..688e51e 100644 --- a/modeling/transformers/bench_gemma3.sh +++ b/modeling/transformers/bench_gemma3.sh @@ -55,3 +55,17 @@ else echo "Summary file not found." fi echo "========================================" + +echo "" +echo "========================================" +echo " TileGym Kernel Coverage" +echo "========================================" +python infer.py \ + --model_id ${MODEL_ID} \ + --use_tilegym \ + --use_cutile \ + --use_attn \ + --report_kernel_coverage \ + --sentence_file ${INPUT_FILE} \ + --output_length ${OUTPUT_LENGTH} +echo "========================================" diff --git a/modeling/transformers/bench_gpt_oss.sh b/modeling/transformers/bench_gpt_oss.sh index 5d2f5c9..798bcee 100644 --- a/modeling/transformers/bench_gpt_oss.sh +++ b/modeling/transformers/bench_gpt_oss.sh @@ -55,3 +55,17 @@ else echo "Summary file not found." fi echo "========================================" + +echo "" +echo "========================================" +echo " TileGym Kernel Coverage" +echo "========================================" +python infer.py \ + --model_id ${MODEL_ID} \ + --use_tilegym \ + --use_cutile \ + --use_attn \ + --report_kernel_coverage \ + --sentence_file ${INPUT_FILE} \ + --output_length ${OUTPUT_LENGTH} +echo "========================================" diff --git a/modeling/transformers/bench_llama.sh b/modeling/transformers/bench_llama.sh index cce2ad6..4704184 100755 --- a/modeling/transformers/bench_llama.sh +++ b/modeling/transformers/bench_llama.sh @@ -55,3 +55,17 @@ else echo "Summary file not found." fi echo "========================================" + +echo "" +echo "========================================" +echo " TileGym Kernel Coverage" +echo "========================================" +python infer.py \ + --model_id ${MODEL_ID} \ + --use_tilegym \ + --use_cutile \ + --use_attn \ + --report_kernel_coverage \ + --sentence_file ${INPUT_FILE} \ + --output_length ${OUTPUT_LENGTH} +echo "========================================" diff --git a/modeling/transformers/bench_mistral.sh b/modeling/transformers/bench_mistral.sh index a99b4d5..5f274ab 100755 --- a/modeling/transformers/bench_mistral.sh +++ b/modeling/transformers/bench_mistral.sh @@ -55,3 +55,17 @@ else echo "Summary file not found." fi echo "========================================" + +echo "" +echo "========================================" +echo " TileGym Kernel Coverage" +echo "========================================" +python infer.py \ + --model_id ${MODEL_ID} \ + --use_tilegym \ + --use_cutile \ + --use_attn \ + --report_kernel_coverage \ + --sentence_file ${INPUT_FILE} \ + --output_length ${OUTPUT_LENGTH} +echo "========================================" diff --git a/modeling/transformers/bench_phi3.sh b/modeling/transformers/bench_phi3.sh index fdd6b3b..186bb75 100755 --- a/modeling/transformers/bench_phi3.sh +++ b/modeling/transformers/bench_phi3.sh @@ -55,3 +55,17 @@ else echo "Summary file not found." fi echo "========================================" + +echo "" +echo "========================================" +echo " TileGym Kernel Coverage" +echo "========================================" +python infer.py \ + --model_id ${MODEL_ID} \ + --use_tilegym \ + --use_cutile \ + --use_attn \ + --report_kernel_coverage \ + --sentence_file ${INPUT_FILE} \ + --output_length ${OUTPUT_LENGTH} +echo "========================================" diff --git a/modeling/transformers/bench_qwen.sh b/modeling/transformers/bench_qwen.sh index 59f2ce3..b90ef6b 100755 --- a/modeling/transformers/bench_qwen.sh +++ b/modeling/transformers/bench_qwen.sh @@ -59,3 +59,18 @@ else echo "Summary file not found." fi echo "========================================" + +echo "" +echo "========================================" +echo " TileGym Kernel Coverage" +echo "========================================" +python infer.py \ + --model_id ${MODEL_ID} \ + --use_tilegym \ + --use_cutile \ + --use_attn \ + --report_kernel_coverage \ + --sentence_file ${INPUT_FILE} \ + --batch_size ${BATCH_SIZE} \ + --output_length ${OUTPUT_LENGTH} +echo "========================================" diff --git a/modeling/transformers/infer.py b/modeling/transformers/infer.py index 0434ce1..f3b810d 100644 --- a/modeling/transformers/infer.py +++ b/modeling/transformers/infer.py @@ -4,8 +4,14 @@ import argparse import datetime +import glob import os +import shlex +import sqlite3 +import subprocess +import sys import zipfile +from collections import defaultdict from pathlib import Path import numpy as np @@ -185,6 +191,11 @@ def parse_args(): # If mock_input_len > 0, then the input length will be set to mock_input_len, this is used to mock the input length # If you use this, you may not get the correct answer of your sentence parser.add_argument("--mock_input_len", type=int, default=0, help="Mock input length") + parser.add_argument( + "--report_kernel_coverage", + action="store_true", + help="Run under nsys profiler and report cuTile kernel coverage (GPU time and launch count ratios)", + ) return parser.parse_args() @@ -278,9 +289,248 @@ def contains(self, key): return False +class NsysKernelCoverageReporter: + """Runs nsys profiling and reports cuTile kernel coverage (GPU time and launch count ratios).""" + + def __init__(self, args): + self.args = args + self.kernel_filter = KernelFilter() + self.log_dir = args.log_dir + self.model_name = args.model_id.split("/")[-1] + + def run(self): + """Launch infer.py under nsys profile and report kernel coverage.""" + inner_args = self._build_inner_args() + nsys_output_base = self._build_output_path() + nsys_cmd = self._build_nsys_command(inner_args, nsys_output_base) + + print(f"Running nsys profile command:\n {shlex.join(nsys_cmd)}\n") + + proc = subprocess.Popen(nsys_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + for line in proc.stdout: + print(line, end="") + proc.wait() + + nsys_rep_path = self._find_nsys_report(nsys_output_base, proc.returncode) + self._compute_and_report_ratio(nsys_rep_path) + + def _build_inner_args(self): + """Reconstruct inner CLI args: strip --report_kernel_coverage, ensure --profile.""" + inner_args = [] + skip_next = False + for arg in sys.argv[1:]: + if skip_next: + skip_next = False + continue + if arg == "--report_kernel_coverage": + continue + inner_args.append(arg) + if "--profile" not in inner_args: + inner_args.append("--profile") + return inner_args + + def _build_output_path(self): + """Build the nsys output base path.""" + os.makedirs(self.log_dir, exist_ok=True) + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + return os.path.join(self.log_dir, f"nsys_{self.model_name}_{timestamp}") + + def _build_nsys_command(self, inner_args, output_base): + """Build the nsys profile command.""" + return [ + "nsys", + "profile", + "-c", + "cudaProfilerApi", + "--capture-range-end=stop-shutdown", + "-o", + output_base, + "--trace=cuda", + "--force-overwrite=true", + "--", + sys.executable, + os.path.abspath(__file__), + ] + inner_args + + def _find_nsys_report(self, output_base, returncode): + """Find the generated .nsys-rep file (nsys may add numeric suffixes).""" + pattern = f"{output_base}*.nsys-rep" + matches = sorted(glob.glob(pattern)) + + if returncode != 0 and not matches: + print(f"\nnsys profile exited with code {returncode} and no report was generated.") + sys.exit(returncode) + elif returncode != 0: + print(f"\nWarning: nsys exited with code {returncode}, but report was generated. Proceeding.") + + if not matches: + print(f"Error: No .nsys-rep file found matching {pattern}") + sys.exit(1) + + nsys_rep_path = matches[-1] + print(f"\nFound nsys report: {nsys_rep_path}") + return nsys_rep_path + + @staticmethod + def _resolve_sqlite_path(path): + """Resolve an .nsys-rep or .sqlite path to a .sqlite file path.""" + if path.endswith(".sqlite"): + if not os.path.isfile(path): + raise FileNotFoundError(f"SQLite file not found: {path}") + return path + + if path.endswith(".nsys-rep"): + if not os.path.isfile(path): + raise FileNotFoundError(f"nsys-rep file not found: {path}") + sibling = path.replace(".nsys-rep", ".sqlite") + if os.path.isfile(sibling): + return sibling + # Export using nsys CLI + output_path = sibling + try: + subprocess.run( + ["nsys", "export", "--type=sqlite", "-o", output_path, path], + check=True, + capture_output=True, + text=True, + ) + except FileNotFoundError: + raise RuntimeError("nsys CLI not found. Install NVIDIA Nsight Systems or provide a .sqlite file.") + except subprocess.CalledProcessError as e: + raise RuntimeError(f"nsys export failed: {e.stderr}") + return output_path + + raise ValueError(f"Unsupported file type: {path} (expected .nsys-rep or .sqlite)") + + def _extract_kernel_durations(self, path): + """Extract GPU kernel durations from an NSight Systems report. + + Args: + path: Path to an .nsys-rep or .sqlite file. + + Returns: + Dict mapping (kernel_idx, kernel_name) to GPU duration in nanoseconds. + """ + sqlite_path = self._resolve_sqlite_path(path) + conn = sqlite3.connect(sqlite_path) + try: + cursor = conn.cursor() + cursor.execute( + """ + SELECT k.start, k.end, s.value AS name + FROM CUPTI_ACTIVITY_KIND_KERNEL k + JOIN StringIds s ON k.demangledName = s.id + ORDER BY k.start ASC + """ + ) + result = {} + for idx, (start, end, name) in enumerate(cursor): + duration_ns = float(end - start) + result[(idx, name)] = duration_ns + return result + finally: + conn.close() + + def _classify_kernels(self, durations): + """Classify kernel durations into TileGym and other categories.""" + tilegym_by_name = defaultdict(float) + tilegym_count_by_name = defaultdict(int) + other_by_name = defaultdict(float) + other_count_by_name = defaultdict(int) + for (_idx, name), dur_ns in durations.items(): + if self.kernel_filter.contains(name): + tilegym_by_name[name] += dur_ns + tilegym_count_by_name[name] += 1 + else: + other_by_name[name] += dur_ns + other_count_by_name[name] += 1 + return tilegym_by_name, tilegym_count_by_name, other_by_name, other_count_by_name + + def _compute_and_report_ratio(self, nsys_rep_path): + """Compute and print the TileGym kernel GPU time ratio from an nsys report.""" + durations = self._extract_kernel_durations(nsys_rep_path) + if not durations: + print("No kernel durations found in the nsys report.") + return + + tilegym_by_name, tilegym_count_by_name, other_by_name, other_count_by_name = self._classify_kernels(durations) + + tilegym_total_ns = sum(tilegym_by_name.values()) + other_total_ns = sum(other_by_name.values()) + all_total_ns = tilegym_total_ns + other_total_ns + tilegym_total_count = sum(tilegym_count_by_name.values()) + other_total_count = sum(other_count_by_name.values()) + all_total_count = tilegym_total_count + other_total_count + + self._print_report( + tilegym_by_name, + tilegym_count_by_name, + tilegym_total_ns, + tilegym_total_count, + all_total_ns, + all_total_count, + ) + + tilegym_time_pct = (tilegym_total_ns / all_total_ns * 100) if all_total_ns > 0 else 0 + tilegym_count_pct = (tilegym_total_count / all_total_count * 100) if all_total_count > 0 else 0 + time_ratio_str = f"{tilegym_time_pct:.1f}%" + count_ratio_str = f"{tilegym_count_pct:.1f}%" + + if self.args.summary_file: + self._append_summary(time_ratio_str, count_ratio_str) + + def _print_report( + self, + tilegym_by_name, + tilegym_count_by_name, + tilegym_total_ns, + tilegym_total_count, + all_total_ns, + all_total_count, + ): + """Print the formatted kernel coverage report table.""" + print("\n===== NSYS KERNEL GPU TIME ANALYSIS =====\n") + header_fmt = "{:<60} {:>8} {:>15} {:>12}" + row_fmt = "{:<60} {:>8} {:>15.3f} {:>11.1f}%" + sep = "-" * 60 + print(header_fmt.format("Kernel Name", "# Calls", "GPU Time (ms)", "% of Total")) + print(f"{sep} {'--------':>8} {'-------------':>15} {'----------':>10}") + + for name, dur_ns in sorted(tilegym_by_name.items(), key=lambda x: -x[1]): + dur_ms = dur_ns / 1e6 + pct = (dur_ns / all_total_ns * 100) if all_total_ns > 0 else 0 + count = tilegym_count_by_name[name] + print(row_fmt.format(name[:60], count, dur_ms, pct)) + + print(f"{sep} {'--------':>8} {'-------------':>15} {'----------':>10}") + tilegym_ms = tilegym_total_ns / 1e6 + all_ms = all_total_ns / 1e6 + tilegym_time_pct = (tilegym_total_ns / all_total_ns * 100) if all_total_ns > 0 else 0 + tilegym_count_pct = (tilegym_total_count / all_total_count * 100) if all_total_count > 0 else 0 + print(row_fmt.format("TileGym Total", tilegym_total_count, tilegym_ms, tilegym_time_pct)) + print(row_fmt.format("All Kernels Total", all_total_count, all_ms, 100.0)) + + time_ratio_str = f"{tilegym_time_pct:.1f}%" + count_ratio_str = f"{tilegym_count_pct:.1f}%" + print(f"\n>>> cuTile Kernel Coverage (GPU Time): {time_ratio_str} <<<") + print(f">>> cuTile Kernel Coverage (# Launches): {count_ratio_str} <<<\n") + + def _append_summary(self, time_ratio_str, count_ratio_str): + """Append coverage ratio to the summary file.""" + with open(self.args.summary_file, "a") as f: + f.write( + f"nsys_cutile_coverage | {self.model_name:<40} | time={time_ratio_str} | launches={count_ratio_str}\n" + ) + print(f"Coverage ratio appended to {self.args.summary_file}") + + def main(): args = parse_args() + if args.report_kernel_coverage: + NsysKernelCoverageReporter(args).run() + return + # Check if GPU is available device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Using device: {device}") @@ -426,6 +676,13 @@ def main(): with record_function("model_inference"): with torch.no_grad(): _ = forward_wrapper.forward() + + # Also run a trace with cudaProfilerAPI. + with torch.no_grad(): + torch.cuda.cudart().cudaProfilerStart() + _ = forward_wrapper.forward() + torch.cuda.cudart().cudaProfilerStop() + # prof.export_chrome_trace("trace.json") filtered_results = [] kernel_filter = KernelFilter() diff --git a/src/tilegym/ops/moe_interface.py b/src/tilegym/ops/moe_interface.py index a99d306..b0a135e 100644 --- a/src/tilegym/ops/moe_interface.py +++ b/src/tilegym/ops/moe_interface.py @@ -23,7 +23,7 @@ def invoke_fused_moe_kernel(*args, **kwargs) -> None: ) def moe_align_block_size( topk_ids: torch.Tensor, block_size: int, num_experts: int -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: raise NotImplementedError(f"moe_align_block_size is not implemented for this backend: {get_current_backend()}")