From 9a7dbb7cbf94254b3dd8c62866483ead57692a66 Mon Sep 17 00:00:00 2001 From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com> Date: Tue, 18 Nov 2025 14:23:05 +0100 Subject: [PATCH 01/10] Run benchmarks for pull request target/base commit --- .github/workflows/test.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a8bd436..26f742e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,6 +26,8 @@ jobs: python-version: ['3.10', '3.11', '3.12', '3.13', '3.14', 'pypy-3.10', 'pypy-3.11'] steps: - uses: actions/checkout@v5 + with: + fetch-depth: 0 - uses: pdm-project/setup-pdm@v4 with: python-version: ${{ matrix.python-version }} @@ -38,10 +40,17 @@ jobs: run: pdm run test - name: Run benchmarks run: pdm run benchmark --benchmark-save '${{ matrix.python-version }}-${{ github.sha }}' + - name: Run benchmarks for target + if: ${{ github.event.pull_request.base.sha }} + run: | + git switch --detach ${{ github.event.pull_request.base.sha }} + pdm run benchmark --benchmark-save '${{ matrix.python-version }}-${{ github.event.pull_request.base.sha }}' + git switch --detach - - name: Upload benchmark results + if: ${{ github.event.pull_request.base.sha }} uses: actions/upload-artifact@v4 with: - name: 'benchmarks-${{ matrix.python-version }}-${{ github.sha }}' + name: 'benchmarks-${{ matrix.python-version }}' path: '.benchmarks/*/*.json' include-hidden-files: true retention-days: 1 From 29535c54f1266647ab96ef0bf4a6758401cab72c Mon Sep 17 00:00:00 2001 From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com> Date: Tue, 18 Nov 2025 16:51:10 +0100 Subject: [PATCH 02/10] Add compare-benchmarks action step --- .github/workflows/test.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 26f742e..d3cb29b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,6 +6,7 @@ on: pull_request: jobs: + check: runs-on: ubuntu-latest steps: @@ -19,6 +20,7 @@ jobs: run: pdm install --group check - name: Run checks run: pdm run check + test: runs-on: ubuntu-latest strategy: @@ -54,3 +56,24 @@ jobs: path: '.benchmarks/*/*.json' include-hidden-files: true retention-days: 1 + + compare-benchmarks: + if: ${{ github.event.pull_request }} + runs-on: ubuntu-latest + needs: test + steps: + - run: echo ${{ github.event.pull_request }} + - uses: actions/checkout@v5 + - uses: pdm-project/setup-pdm@v4 + with: + python-version: '3.14' + cache: true + cache-dependency-path: 'pylock.toml' + - name: Install test dependencies + run: pdm install --group test + - uses: actions/download-artifact@v5 + with: + pattern: benchmarks-* + path: .benchmarks/ + merge-multiple: true + - run: ls -lR .benchmarks/ From 965320b52c6e4a4ba90bdc7071219198c5886b5a Mon Sep 17 00:00:00 2001 From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com> Date: Tue, 18 Nov 2025 16:59:12 +0100 Subject: [PATCH 03/10] Add comparison script, call from compare-benchmarks step --- .github/compare-benchmarks.py | 68 +++++++++++++++++++++++++++++++++++ .github/workflows/test.yml | 5 ++- pylock.toml | 15 +++++++- pyproject.toml | 7 ++-- 4 files changed, 88 insertions(+), 7 deletions(-) create mode 100644 .github/compare-benchmarks.py diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py new file mode 100644 index 0000000..030c95d --- /dev/null +++ b/.github/compare-benchmarks.py @@ -0,0 +1,68 @@ +import json +from argparse import ArgumentParser +from glob import iglob +from itertools import chain, groupby +from operator import itemgetter + +from tabulate import tabulate + + +def compare_benchmarks(benchmarks, old, new): + for benchmark, python, by_commit in benchmarks: + result_old, result_new = by_commit[old], by_commit[new] + yield benchmark, python, (result_new - result_old) / result_old + + +def combine_runs(runs, commits): + stats = sorted(chain.from_iterable(runs)) + for benchmark, by_python in groupby(stats, key=itemgetter(0)): + for python, by_commit in groupby(by_python, key=itemgetter(1)): + by_commit = {commit: value for *_, commit, value in by_commit if commit in commits} + if len(by_commit) == len(commits): + yield benchmark, python, by_commit + + +def read_run(run, stat='median'): + python_implementation = run['machine_info']['python_implementation'] + python_version = '.'.join(run['machine_info']['python_version'].split('.')[:2]) + if python_implementation != 'CPython': + python_version = f'{python_implementation} {python_version}' + + commit = run['commit_info']['id'] + + for benchmark in run['benchmarks']: + yield benchmark['name'], python_version, commit, benchmark['stats'][stat] + + +def loadf(f): + with open(f, 'r') as f: + return json.load(f) + + +def to_table(benchmarks): + headers = None + table = [] + + for benchmark_name, by_python in groupby(benchmarks, key=itemgetter(0)): + by_python = {python: difference for *_, python, difference in by_python} + if not headers: + headers = ('', *by_python.keys()) + + table.append((benchmark_name, *by_python.values())) + + return headers, table + + +if __name__ == '__main__': + args = ArgumentParser() + args.add_argument('--old', metavar='COMMIT', required=True) + args.add_argument('--new', metavar='COMMIT', required=True) + args.add_argument('--stat', default='median') + + args = args.parse_args() + + benchmarks = (read_run(loadf(f), stat=args.stat) for f in iglob('.benchmarks/*/*.json')) + benchmarks = combine_runs(benchmarks, commits={args.old, args.new}) + benchmarks = compare_benchmarks(benchmarks, old=args.old, new=args.new) + headers, table = to_table(benchmarks) + print(tabulate(table, headers=headers, tablefmt='github', floatfmt='+.0%')) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d3cb29b..0bc1d98 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -58,11 +58,10 @@ jobs: retention-days: 1 compare-benchmarks: - if: ${{ github.event.pull_request }} runs-on: ubuntu-latest needs: test + if: ${{ github.event.pull_request }} steps: - - run: echo ${{ github.event.pull_request }} - uses: actions/checkout@v5 - uses: pdm-project/setup-pdm@v4 with: @@ -76,4 +75,4 @@ jobs: pattern: benchmarks-* path: .benchmarks/ merge-multiple: true - - run: ls -lR .benchmarks/ + - run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} diff --git a/pylock.toml b/pylock.toml index 7da8eb6..667c5a4 100644 --- a/pylock.toml +++ b/pylock.toml @@ -310,6 +310,19 @@ marker = "\"check\" in dependency_groups" [packages.tool.pdm] dependencies = [] +[[packages]] +name = "tabulate" +version = "0.9.0" +requires-python = ">=3.7" +sdist = {name = "tabulate-0.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hashes = {sha256 = "0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}} +wheels = [ + {name = "tabulate-0.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl",hashes = {sha256 = "024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}}, +] +marker = "\"test\" in dependency_groups" + +[packages.tool.pdm] +dependencies = [] + [[packages]] name = "types-pyyaml" version = "6.0.12.20250915" @@ -590,7 +603,7 @@ marker = "\"test\" in dependency_groups" dependencies = [] [tool.pdm] -hashes = {sha256 = "4d844899bab0d12815b3587bacd58900cc01d8718caf0ba2d0274e0803136606"} +hashes = {sha256 = "84bb8a9b1c809227e3051f501aa2ea8a09d588205a7c1204a4ddc2fb23b57550"} strategy = ["inherit_metadata", "static_urls"] [[tool.pdm.targets]] diff --git a/pyproject.toml b/pyproject.toml index 5a904e3..4b523c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,6 @@ classifiers = [ [project.urls] homepage = "https://github.com/NetherlandsForensicInstitute/confidence/" -[tool.pdm] -version = {source = "scm"} - [dependency-groups] check = [ "mypy", @@ -44,8 +41,12 @@ test = [ "coverage", "pytest", "pytest-benchmark", + "tabulate", ] +[tool.pdm] +version = {source = "scm"} + [tool.pdm.scripts] all = {composite = ["check", "test"]} benchmark = "pdm run test --benchmark-only --benchmark-autosave tests/" From 432f615142d5999b2009358ce146b10c7beb1a1b Mon Sep 17 00:00:00 2001 From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com> Date: Thu, 20 Nov 2025 22:18:55 +0100 Subject: [PATCH 04/10] Store comparison results in comment file --- .github/compare-benchmarks.py | 15 ++++++++++++++- .github/workflows/test.yml | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py index 030c95d..5296c74 100644 --- a/.github/compare-benchmarks.py +++ b/.github/compare-benchmarks.py @@ -7,6 +7,12 @@ from tabulate import tabulate +COMMENT_TEMPLATE = """ +Comparing *{stat}* metric between **base** {old} and **proposed** {new}: + +{table} +""" + def compare_benchmarks(benchmarks, old, new): for benchmark, python, by_commit in benchmarks: result_old, result_new = by_commit[old], by_commit[new] @@ -58,6 +64,7 @@ def to_table(benchmarks): args.add_argument('--old', metavar='COMMIT', required=True) args.add_argument('--new', metavar='COMMIT', required=True) args.add_argument('--stat', default='median') + args.add_argument('--comment-file') args = args.parse_args() @@ -65,4 +72,10 @@ def to_table(benchmarks): benchmarks = combine_runs(benchmarks, commits={args.old, args.new}) benchmarks = compare_benchmarks(benchmarks, old=args.old, new=args.new) headers, table = to_table(benchmarks) - print(tabulate(table, headers=headers, tablefmt='github', floatfmt='+.0%')) + + table = tabulate(table, headers=headers, tablefmt='github', floatfmt='+.0%') + print(table) + + if args.comment_file: + with open(args.comment_file, 'wt') as comment_file: + comment_file.write(COMMENT_TEMPLATE.format(old=args.old, new=args.new, stat=args.stat, table=table)) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0bc1d98..367c27b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -75,4 +75,4 @@ jobs: pattern: benchmarks-* path: .benchmarks/ merge-multiple: true - - run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} + - run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment ./BENCHMARK_COMMENT From 4ecf12230e7e285812b38d6f8562ea2ad4def1a6 Mon Sep 17 00:00:00 2001 From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com> Date: Fri, 21 Nov 2025 16:22:12 +0100 Subject: [PATCH 05/10] Create or update comment with pull request benchmark comparison --- .github/compare-benchmarks.py | 2 ++ .github/workflows/test.yml | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py index 5296c74..116870f 100644 --- a/.github/compare-benchmarks.py +++ b/.github/compare-benchmarks.py @@ -8,6 +8,8 @@ COMMENT_TEMPLATE = """ + + Comparing *{stat}* metric between **base** {old} and **proposed** {new}: {table} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 367c27b..3f6c3bf 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -75,4 +75,12 @@ jobs: pattern: benchmarks-* path: .benchmarks/ merge-multiple: true - - run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment ./BENCHMARK_COMMENT + - name: Compare benchmarks + run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment ./BENCHMARK_COMMENT + - uses: edumserrano/find-create-or-update-comment@v3 + with: + issue-number: ${{ github.event.pull_request.number }} + body-includes: '' + comment-author: 'github-actions[bot]' + edit-mode: replace + body-path: './BENCHMARK_COMMENT' From 4d442b34039e088ed17814847c6a5652e71f553d Mon Sep 17 00:00:00 2001 From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com> Date: Fri, 21 Nov 2025 16:31:50 +0100 Subject: [PATCH 06/10] Mention the word "benchmarks" in the benchmarks comparison comment --- .github/compare-benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py index 116870f..147f0ad 100644 --- a/.github/compare-benchmarks.py +++ b/.github/compare-benchmarks.py @@ -10,7 +10,7 @@ COMMENT_TEMPLATE = """ -Comparing *{stat}* metric between **base** {old} and **proposed** {new}: +Comparing *{stat}* metric of benchmarks between **base** {old} and **proposed** {new}: {table} """ From add3e3a4358211626459ba7dbe7ec0c19ad6ce6b Mon Sep 17 00:00:00 2001 From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com> Date: Fri, 21 Nov 2025 16:32:35 +0100 Subject: [PATCH 07/10] Run check before test to avoid running test matrix on check violation --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3f6c3bf..9cecb7a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,6 +23,7 @@ jobs: test: runs-on: ubuntu-latest + needs: check strategy: matrix: python-version: ['3.10', '3.11', '3.12', '3.13', '3.14', 'pypy-3.10', 'pypy-3.11'] From c6a19015b5bcc2999644db927522a3b71f64e339 Mon Sep 17 00:00:00 2001 From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com> Date: Fri, 19 Dec 2025 15:37:15 +0100 Subject: [PATCH 08/10] Improve comment template to include update notice and "lower is better" --- .github/compare-benchmarks.py | 19 +++++++++++++++++-- .github/workflows/test.yml | 2 +- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py index 147f0ad..6ee02fd 100644 --- a/.github/compare-benchmarks.py +++ b/.github/compare-benchmarks.py @@ -10,11 +10,18 @@ COMMENT_TEMPLATE = """ -Comparing *{stat}* metric of benchmarks between **base** {old} and **proposed** {new}: +Comparing *{stat}* ({better} is better) metric of benchmarks between this PR's target ({old}) and the HEAD of this PR ({new}): {table} + +This comment will be updated on subsequent pushes. """ +BETTER = { + 'median': 'lower', + 'ops': 'higher', +} + def compare_benchmarks(benchmarks, old, new): for benchmark, python, by_commit in benchmarks: result_old, result_new = by_commit[old], by_commit[new] @@ -80,4 +87,12 @@ def to_table(benchmarks): if args.comment_file: with open(args.comment_file, 'wt') as comment_file: - comment_file.write(COMMENT_TEMPLATE.format(old=args.old, new=args.new, stat=args.stat, table=table)) + comment_file.write( + COMMENT_TEMPLATE.format( + old=args.old, + new=args.new, + stat=args.stat, + better=BETTER.get(args.stat, 'lower'), + table=table, + ) + ) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9cecb7a..6dfb052 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -77,7 +77,7 @@ jobs: path: .benchmarks/ merge-multiple: true - name: Compare benchmarks - run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment ./BENCHMARK_COMMENT + run: pdm run python .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment-file ./BENCHMARK_COMMENT - uses: edumserrano/find-create-or-update-comment@v3 with: issue-number: ${{ github.event.pull_request.number }} From 9c79c9bdd22dd2e350540e1b7df173aaa22192ca Mon Sep 17 00:00:00 2001 From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com> Date: Wed, 7 Jan 2026 11:06:40 +0100 Subject: [PATCH 09/10] Enable running benchmark twice for different commits locally --- .github/compare-benchmarks.py | 2 +- pyproject.toml | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py index 6ee02fd..cdde217 100644 --- a/.github/compare-benchmarks.py +++ b/.github/compare-benchmarks.py @@ -14,7 +14,7 @@ {table} -This comment will be updated on subsequent pushes. +*(This comment will be updated on subsequent pushes)* """ BETTER = { diff --git a/pyproject.toml b/pyproject.toml index 4b523c9..3d4c3b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,13 @@ version = {source = "scm"} [tool.pdm.scripts] all = {composite = ["check", "test"]} benchmark = "pdm run test --benchmark-only --benchmark-autosave tests/" +benchmark-against = {keep_going = true, composite = [ + "git rev-parse {args}", + "pdm run benchmark", + "git switch --detach {args}", + "pdm run benchmark", + "git switch -", +]} check = {composite = ["check-lock", "format", "lint", "type-check"]} check-lock = "pdm lock --check" format = "ruff format --diff confidence/ tests/" From 67c5ad84c520bc887ba7e50f80228c15be11329e Mon Sep 17 00:00:00 2001 From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com> Date: Wed, 7 Jan 2026 11:44:04 +0100 Subject: [PATCH 10/10] Treat old and new comparison arguments as git refs --- .github/compare-benchmarks.py | 9 +++++++-- pyproject.toml | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py index cdde217..5bab1f8 100644 --- a/.github/compare-benchmarks.py +++ b/.github/compare-benchmarks.py @@ -1,4 +1,5 @@ import json +import subprocess from argparse import ArgumentParser from glob import iglob from itertools import chain, groupby @@ -70,13 +71,17 @@ def to_table(benchmarks): if __name__ == '__main__': args = ArgumentParser() - args.add_argument('--old', metavar='COMMIT', required=True) - args.add_argument('--new', metavar='COMMIT', required=True) + args.add_argument('--old', metavar='REF', required=True) + args.add_argument('--new', metavar='REF', default='HEAD') args.add_argument('--stat', default='median') args.add_argument('--comment-file') args = args.parse_args() + # pytest-benchmark will store full commit hashes, git rev-parse the old and new references to get the commit hashes + args.old = subprocess.check_output(('git', 'rev-parse', args.old), text=True).strip() + args.new = subprocess.check_output(('git', 'rev-parse', args.new), text=True).strip() + benchmarks = (read_run(loadf(f), stat=args.stat) for f in iglob('.benchmarks/*/*.json')) benchmarks = combine_runs(benchmarks, commits={args.old, args.new}) benchmarks = compare_benchmarks(benchmarks, old=args.old, new=args.new) diff --git a/pyproject.toml b/pyproject.toml index 3d4c3b7..6ead834 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ benchmark-against = {keep_going = true, composite = [ "git switch --detach {args}", "pdm run benchmark", "git switch -", + "pdm run python .github/compare-benchmarks.py --old {args}" ]} check = {composite = ["check-lock", "format", "lint", "type-check"]} check-lock = "pdm lock --check"