From 9a7dbb7cbf94254b3dd8c62866483ead57692a66 Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Tue, 18 Nov 2025 14:23:05 +0100
Subject: [PATCH 01/10] Run benchmarks for pull request target/base commit

---
 .github/workflows/test.yml | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a8bd436..26f742e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -26,6 +26,8 @@ jobs:
         python-version: ['3.10', '3.11', '3.12', '3.13', '3.14', 'pypy-3.10', 'pypy-3.11']
     steps:
       - uses: actions/checkout@v5
+        with:
+          fetch-depth: 0
       - uses: pdm-project/setup-pdm@v4
         with:
           python-version: ${{ matrix.python-version }}
@@ -38,10 +40,17 @@ jobs:
         run: pdm run test
       - name: Run benchmarks
         run: pdm run benchmark --benchmark-save '${{ matrix.python-version }}-${{ github.sha }}'
+      - name: Run benchmarks for target
+        if: ${{ github.event.pull_request.base.sha }}
+        run: |
+          git switch --detach ${{ github.event.pull_request.base.sha }}
+          pdm run benchmark --benchmark-save '${{ matrix.python-version }}-${{ github.event.pull_request.base.sha }}'
+          git switch --detach -
       - name: Upload benchmark results
+        if: ${{ github.event.pull_request.base.sha }}
         uses: actions/upload-artifact@v4
         with:
-          name: 'benchmarks-${{ matrix.python-version }}-${{ github.sha }}'
+          name: 'benchmarks-${{ matrix.python-version }}'
           path: '.benchmarks/*/*.json'
           include-hidden-files: true
           retention-days: 1

From 29535c54f1266647ab96ef0bf4a6758401cab72c Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Tue, 18 Nov 2025 16:51:10 +0100
Subject: [PATCH 02/10] Add compare-benchmarks action step

---
 .github/workflows/test.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 26f742e..d3cb29b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -6,6 +6,7 @@ on:
   pull_request:
 
 jobs:
+
   check:
     runs-on: ubuntu-latest
     steps:
@@ -19,6 +20,7 @@ jobs:
         run: pdm install --group check
       - name: Run checks
         run: pdm run check
+
   test:
     runs-on: ubuntu-latest
     strategy:
@@ -54,3 +56,24 @@ jobs:
           path: '.benchmarks/*/*.json'
           include-hidden-files: true
           retention-days: 1
+
+  compare-benchmarks:
+    if: ${{ github.event.pull_request }}
+    runs-on: ubuntu-latest
+    needs: test
+    steps:
+      - run: echo ${{ github.event.pull_request }}
+      - uses: actions/checkout@v5
+      - uses: pdm-project/setup-pdm@v4
+        with:
+          python-version: '3.14'
+          cache: true
+          cache-dependency-path: 'pylock.toml'
+      - name: Install test dependencies
+        run: pdm install --group test
+      - uses: actions/download-artifact@v5
+        with:
+          pattern: benchmarks-*
+          path: .benchmarks/
+          merge-multiple: true
+      - run: ls -lR .benchmarks/

From 965320b52c6e4a4ba90bdc7071219198c5886b5a Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Tue, 18 Nov 2025 16:59:12 +0100
Subject: [PATCH 03/10] Add comparison script, call from compare-benchmarks
 step

---
 .github/compare-benchmarks.py | 68 +++++++++++++++++++++++++++++++++++
 .github/workflows/test.yml    |  5 ++-
 pylock.toml                   | 15 +++++++-
 pyproject.toml                |  7 ++--
 4 files changed, 88 insertions(+), 7 deletions(-)
 create mode 100644 .github/compare-benchmarks.py

diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py
new file mode 100644
index 0000000..030c95d
--- /dev/null
+++ b/.github/compare-benchmarks.py
@@ -0,0 +1,68 @@
+import json
+from argparse import ArgumentParser
+from glob import iglob
+from itertools import chain, groupby
+from operator import itemgetter
+
+from tabulate import tabulate
+
+
+def compare_benchmarks(benchmarks, old, new):
+    for benchmark, python, by_commit in benchmarks:
+        result_old, result_new = by_commit[old], by_commit[new]
+        yield benchmark, python, (result_new - result_old) / result_old
+
+
+def combine_runs(runs, commits):
+    stats = sorted(chain.from_iterable(runs))
+    for benchmark, by_python in groupby(stats, key=itemgetter(0)):
+        for python, by_commit in groupby(by_python, key=itemgetter(1)):
+            by_commit = {commit: value for *_, commit, value in by_commit if commit in commits}
+            if len(by_commit) == len(commits):
+                yield benchmark, python, by_commit
+
+
+def read_run(run, stat='median'):
+    python_implementation = run['machine_info']['python_implementation']
+    python_version = '.'.join(run['machine_info']['python_version'].split('.')[:2])
+    if python_implementation != 'CPython':
+        python_version = f'{python_implementation} {python_version}'
+
+    commit = run['commit_info']['id']
+
+    for benchmark in run['benchmarks']:
+        yield benchmark['name'], python_version, commit, benchmark['stats'][stat]
+
+
+def loadf(f):
+    with open(f, 'r') as f:
+        return json.load(f)
+
+
+def to_table(benchmarks):
+    headers = None
+    table = []
+
+    for benchmark_name, by_python in groupby(benchmarks, key=itemgetter(0)):
+        by_python = {python: difference for *_, python, difference in by_python}
+        if not headers:
+            headers = ('', *by_python.keys())
+
+        table.append((benchmark_name, *by_python.values()))
+
+    return headers, table
+
+
+if __name__ == '__main__':
+    args = ArgumentParser()
+    args.add_argument('--old', metavar='COMMIT', required=True)
+    args.add_argument('--new', metavar='COMMIT', required=True)
+    args.add_argument('--stat', default='median')
+
+    args = args.parse_args()
+
+    benchmarks = (read_run(loadf(f), stat=args.stat) for f in iglob('.benchmarks/*/*.json'))
+    benchmarks = combine_runs(benchmarks, commits={args.old, args.new})
+    benchmarks = compare_benchmarks(benchmarks, old=args.old, new=args.new)
+    headers, table = to_table(benchmarks)
+    print(tabulate(table, headers=headers, tablefmt='github', floatfmt='+.0%'))
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d3cb29b..0bc1d98 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -58,11 +58,10 @@ jobs:
           retention-days: 1
 
   compare-benchmarks:
-    if: ${{ github.event.pull_request }}
     runs-on: ubuntu-latest
     needs: test
+    if: ${{ github.event.pull_request }}
     steps:
-      - run: echo ${{ github.event.pull_request }}
       - uses: actions/checkout@v5
       - uses: pdm-project/setup-pdm@v4
         with:
@@ -76,4 +75,4 @@ jobs:
           pattern: benchmarks-*
           path: .benchmarks/
           merge-multiple: true
-      - run: ls -lR .benchmarks/
+      - run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }}
diff --git a/pylock.toml b/pylock.toml
index 7da8eb6..667c5a4 100644
--- a/pylock.toml
+++ b/pylock.toml
@@ -310,6 +310,19 @@ marker = "\"check\" in dependency_groups"
 [packages.tool.pdm]
 dependencies = []
 
+[[packages]]
+name = "tabulate"
+version = "0.9.0"
+requires-python = ">=3.7"
+sdist = {name = "tabulate-0.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hashes = {sha256 = "0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}}
+wheels = [
+    {name = "tabulate-0.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl",hashes = {sha256 = "024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}},
+]
+marker = "\"test\" in dependency_groups"
+
+[packages.tool.pdm]
+dependencies = []
+
 [[packages]]
 name = "types-pyyaml"
 version = "6.0.12.20250915"
@@ -590,7 +603,7 @@ marker = "\"test\" in dependency_groups"
 dependencies = []
 
 [tool.pdm]
-hashes = {sha256 = "4d844899bab0d12815b3587bacd58900cc01d8718caf0ba2d0274e0803136606"}
+hashes = {sha256 = "84bb8a9b1c809227e3051f501aa2ea8a09d588205a7c1204a4ddc2fb23b57550"}
 strategy = ["inherit_metadata", "static_urls"]
 
 [[tool.pdm.targets]]
diff --git a/pyproject.toml b/pyproject.toml
index 5a904e3..4b523c9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,9 +30,6 @@ classifiers = [
 [project.urls]
 homepage = "https://github.com/NetherlandsForensicInstitute/confidence/"
 
-[tool.pdm]
-version = {source = "scm"}
-
 [dependency-groups]
 check = [
     "mypy",
@@ -44,8 +41,12 @@ test = [
     "coverage",
     "pytest",
     "pytest-benchmark",
+    "tabulate",
 ]
 
+[tool.pdm]
+version = {source = "scm"}
+
 [tool.pdm.scripts]
 all = {composite = ["check", "test"]}
 benchmark = "pdm run test --benchmark-only --benchmark-autosave tests/"

From 432f615142d5999b2009358ce146b10c7beb1a1b Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Thu, 20 Nov 2025 22:18:55 +0100
Subject: [PATCH 04/10] Store comparison results in comment file

---
 .github/compare-benchmarks.py | 15 ++++++++++++++-
 .github/workflows/test.yml    |  2 +-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py
index 030c95d..5296c74 100644
--- a/.github/compare-benchmarks.py
+++ b/.github/compare-benchmarks.py
@@ -7,6 +7,12 @@
 from tabulate import tabulate
 
 
+COMMENT_TEMPLATE = """
+Comparing *{stat}* metric between **base** {old} and **proposed** {new}:
+
+{table}
+"""
+
 def compare_benchmarks(benchmarks, old, new):
     for benchmark, python, by_commit in benchmarks:
         result_old, result_new = by_commit[old], by_commit[new]
@@ -58,6 +64,7 @@ def to_table(benchmarks):
     args.add_argument('--old', metavar='COMMIT', required=True)
     args.add_argument('--new', metavar='COMMIT', required=True)
     args.add_argument('--stat', default='median')
+    args.add_argument('--comment-file')
 
     args = args.parse_args()
 
@@ -65,4 +72,10 @@ def to_table(benchmarks):
     benchmarks = combine_runs(benchmarks, commits={args.old, args.new})
     benchmarks = compare_benchmarks(benchmarks, old=args.old, new=args.new)
     headers, table = to_table(benchmarks)
-    print(tabulate(table, headers=headers, tablefmt='github', floatfmt='+.0%'))
+
+    table = tabulate(table, headers=headers, tablefmt='github', floatfmt='+.0%')
+    print(table)
+
+    if args.comment_file:
+        with open(args.comment_file, 'wt') as comment_file:
+            comment_file.write(COMMENT_TEMPLATE.format(old=args.old, new=args.new, stat=args.stat, table=table))
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0bc1d98..367c27b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -75,4 +75,4 @@ jobs:
           pattern: benchmarks-*
           path: .benchmarks/
           merge-multiple: true
-      - run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }}
+      - run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment ./BENCHMARK_COMMENT

From 4ecf12230e7e285812b38d6f8562ea2ad4def1a6 Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Fri, 21 Nov 2025 16:22:12 +0100
Subject: [PATCH 05/10] Create or update comment with pull request benchmark
 comparison

---
 .github/compare-benchmarks.py |  2 ++
 .github/workflows/test.yml    | 10 +++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py
index 5296c74..116870f 100644
--- a/.github/compare-benchmarks.py
+++ b/.github/compare-benchmarks.py
@@ -8,6 +8,8 @@
 
 
 COMMENT_TEMPLATE = """
+<!-- compare-benchmarks.py -->
+
 Comparing *{stat}* metric between **base** {old} and **proposed** {new}:
 
 {table}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 367c27b..3f6c3bf 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -75,4 +75,12 @@ jobs:
           pattern: benchmarks-*
           path: .benchmarks/
           merge-multiple: true
-      - run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment ./BENCHMARK_COMMENT
+      - name: Compare benchmarks
+        run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment ./BENCHMARK_COMMENT
+      - uses: edumserrano/find-create-or-update-comment@v3
+        with:
+          issue-number: ${{ github.event.pull_request.number }}
+          body-includes: '<!-- compare-benchmarks.py -->'
+          comment-author: 'github-actions[bot]'
+          edit-mode: replace
+          body-path: './BENCHMARK_COMMENT'

From 4d442b34039e088ed17814847c6a5652e71f553d Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Fri, 21 Nov 2025 16:31:50 +0100
Subject: [PATCH 06/10] Mention the word "benchmarks" in the benchmarks
 comparison comment

---
 .github/compare-benchmarks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py
index 116870f..147f0ad 100644
--- a/.github/compare-benchmarks.py
+++ b/.github/compare-benchmarks.py
@@ -10,7 +10,7 @@
 COMMENT_TEMPLATE = """
 <!-- compare-benchmarks.py -->
 
-Comparing *{stat}* metric between **base** {old} and **proposed** {new}:
+Comparing *{stat}* metric of benchmarks between **base** {old} and **proposed** {new}:
 
 {table}
 """

From add3e3a4358211626459ba7dbe7ec0c19ad6ce6b Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Fri, 21 Nov 2025 16:32:35 +0100
Subject: [PATCH 07/10] Run check before test to avoid running test matrix on
 check violation

---
 .github/workflows/test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3f6c3bf..9cecb7a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -23,6 +23,7 @@ jobs:
 
   test:
     runs-on: ubuntu-latest
+    needs: check
     strategy:
       matrix:
         python-version: ['3.10', '3.11', '3.12', '3.13', '3.14', 'pypy-3.10', 'pypy-3.11']

From c6a19015b5bcc2999644db927522a3b71f64e339 Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Fri, 19 Dec 2025 15:37:15 +0100
Subject: [PATCH 08/10] Improve comment template to include update notice and
 "lower is better"

---
 .github/compare-benchmarks.py | 19 +++++++++++++++++--
 .github/workflows/test.yml    |  2 +-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py
index 147f0ad..6ee02fd 100644
--- a/.github/compare-benchmarks.py
+++ b/.github/compare-benchmarks.py
@@ -10,11 +10,18 @@
 COMMENT_TEMPLATE = """
 <!-- compare-benchmarks.py -->
 
-Comparing *{stat}* metric of benchmarks between **base** {old} and **proposed** {new}:
+Comparing *{stat}* ({better} is better) metric of benchmarks between this PR's target ({old}) and the HEAD of this PR ({new}):
 
 {table}
+
+This comment will be updated on subsequent pushes.
 """
 
+BETTER = {
+    'median': 'lower',
+    'ops': 'higher',
+}
+
 def compare_benchmarks(benchmarks, old, new):
     for benchmark, python, by_commit in benchmarks:
         result_old, result_new = by_commit[old], by_commit[new]
@@ -80,4 +87,12 @@ def to_table(benchmarks):
 
     if args.comment_file:
         with open(args.comment_file, 'wt') as comment_file:
-            comment_file.write(COMMENT_TEMPLATE.format(old=args.old, new=args.new, stat=args.stat, table=table))
+            comment_file.write(
+                COMMENT_TEMPLATE.format(
+                    old=args.old,
+                    new=args.new,
+                    stat=args.stat,
+                    better=BETTER.get(args.stat, 'lower'),
+                    table=table,
+                )
+            )
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9cecb7a..6dfb052 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -77,7 +77,7 @@ jobs:
           path: .benchmarks/
           merge-multiple: true
       - name: Compare benchmarks
-        run: pdm run .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment ./BENCHMARK_COMMENT
+        run: pdm run python .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment-file ./BENCHMARK_COMMENT
       - uses: edumserrano/find-create-or-update-comment@v3
         with:
           issue-number: ${{ github.event.pull_request.number }}

From 9c79c9bdd22dd2e350540e1b7df173aaa22192ca Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Wed, 7 Jan 2026 11:06:40 +0100
Subject: [PATCH 09/10] Enable running benchmark twice for different commits
 locally

---
 .github/compare-benchmarks.py | 2 +-
 pyproject.toml                | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py
index 6ee02fd..cdde217 100644
--- a/.github/compare-benchmarks.py
+++ b/.github/compare-benchmarks.py
@@ -14,7 +14,7 @@
 
 {table}
 
-This comment will be updated on subsequent pushes.
+*(This comment will be updated on subsequent pushes)*
 """
 
 BETTER = {
diff --git a/pyproject.toml b/pyproject.toml
index 4b523c9..3d4c3b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,13 @@ version = {source = "scm"}
 [tool.pdm.scripts]
 all = {composite = ["check", "test"]}
 benchmark = "pdm run test --benchmark-only --benchmark-autosave tests/"
+benchmark-against = {keep_going = true, composite = [
+    "git rev-parse {args}",
+    "pdm run benchmark",
+    "git switch --detach {args}",
+    "pdm run benchmark",
+    "git switch -",
+]}
 check = {composite = ["check-lock", "format", "lint", "type-check"]}
 check-lock = "pdm lock --check"
 format = "ruff format --diff confidence/ tests/"

From 67c5ad84c520bc887ba7e50f80228c15be11329e Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Wed, 7 Jan 2026 11:44:04 +0100
Subject: [PATCH 10/10] Treat old and new comparison arguments as git refs

---
 .github/compare-benchmarks.py | 9 +++++++--
 pyproject.toml                | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py
index cdde217..5bab1f8 100644
--- a/.github/compare-benchmarks.py
+++ b/.github/compare-benchmarks.py
@@ -1,4 +1,5 @@
 import json
+import subprocess
 from argparse import ArgumentParser
 from glob import iglob
 from itertools import chain, groupby
@@ -70,13 +71,17 @@ def to_table(benchmarks):
 
 if __name__ == '__main__':
     args = ArgumentParser()
-    args.add_argument('--old', metavar='COMMIT', required=True)
-    args.add_argument('--new', metavar='COMMIT', required=True)
+    args.add_argument('--old', metavar='REF', required=True)
+    args.add_argument('--new', metavar='REF', default='HEAD')
     args.add_argument('--stat', default='median')
     args.add_argument('--comment-file')
 
     args = args.parse_args()
 
+    # pytest-benchmark stores full commit hashes, so resolve the old and new refs to full hashes with git rev-parse
+    args.old = subprocess.check_output(('git', 'rev-parse', args.old), text=True).strip()
+    args.new = subprocess.check_output(('git', 'rev-parse', args.new), text=True).strip()
+
     benchmarks = (read_run(loadf(f), stat=args.stat) for f in iglob('.benchmarks/*/*.json'))
     benchmarks = combine_runs(benchmarks, commits={args.old, args.new})
     benchmarks = compare_benchmarks(benchmarks, old=args.old, new=args.new)
diff --git a/pyproject.toml b/pyproject.toml
index 3d4c3b7..6ead834 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,7 @@ benchmark-against = {keep_going = true, composite = [
     "git switch --detach {args}",
     "pdm run benchmark",
     "git switch -",
+    "pdm run python .github/compare-benchmarks.py --old {args}"
 ]}
 check = {composite = ["check-lock", "format", "lint", "type-check"]}
 check-lock = "pdm lock --check"