NetherlandsForensicInstitute · akaIDIOT · Jan 21, 2026 · Nov 18, 2025 · Nov 18, 2025 · Nov 18, 2025
diff --git a/.github/compare-benchmarks.py b/.github/compare-benchmarks.py
@@ -0,0 +1,103 @@
+import json
+import subprocess
+from argparse import ArgumentParser
+from glob import iglob
+from itertools import chain, groupby
+from operator import itemgetter
+
+from tabulate import tabulate
+
+
+# Body of the pull request comment; the HTML marker on its first line is what the workflow's comment step matches on
+# to recognise a comment written by this script.
+COMMENT_TEMPLATE = """
+<!-- compare-benchmarks.py -->
+
+Comparing *{stat}* ({better} is better) metric of benchmarks between this PR's target ({old}) and the HEAD of this PR ({new}):
+
+{table}
+
+*(This comment will be updated on subsequent pushes)*
+"""
+
+# Direction in which a statistic improves, used to fill {better} in the template above.
+BETTER = dict(median='lower', ops='higher')
+
+def compare_benchmarks(benchmarks, old, new):
+    """
+    Yield the relative change of each benchmark between two commits.
+
+    :param benchmarks: iterable of ``(benchmark, python, by_commit)`` tuples, where ``by_commit`` maps a commit to
+        the measured value for that commit
+    :param old: key of the baseline commit in ``by_commit``
+    :param new: key of the commit being compared against the baseline
+    :return: generator of ``(benchmark, python, difference)`` tuples, with ``difference`` the change from old to
+        new as a fraction of the old value
+    """
+    for name, interpreter, results in benchmarks:
+        baseline = results[old]
+        candidate = results[new]
+        relative_change = (candidate - baseline) / baseline
+        yield name, interpreter, relative_change
+
+
+def combine_runs(runs, commits):
+    """
+    Merge the statistics of multiple runs into one mapping per benchmark and python version.
+
+    :param runs: iterable of iterables of ``(benchmark, python, commit, value)`` tuples
+    :param commits: collection of commits to retain; other commits are ignored
+    :return: generator of ``(benchmark, python, by_commit)`` tuples, sorted by benchmark and python, limited to the
+        combinations that have a value for every commit in ``commits``
+    """
+    records = list(chain.from_iterable(runs))
+    # groupby only groups adjacent items, so the records need to be ordered by benchmark and python first
+    records.sort()
+
+    for (name, interpreter), group in groupby(records, key=itemgetter(0, 1)):
+        results = {}
+        for *_, commit, value in group:
+            if commit in commits:
+                results[commit] = value
+
+        # a comparison needs a value for each of the requested commits, skip incomplete combinations
+        if len(results) != len(commits):
+            continue
+
+        yield name, interpreter, results
+
+
+def read_run(run, stat='median'):
+    """
+    Yield one statistic for every benchmark in a single loaded benchmark run.
+
+    :param run: mapping with the keys ``machine_info``, ``commit_info`` and ``benchmarks``, as read from a saved
+        benchmark JSON file
+    :param stat: name of the statistic to take from each benchmark's ``stats``
+    :return: generator of ``(benchmark name, python version, commit, value)`` tuples
+    """
+    machine_info = run['machine_info']
+    implementation = machine_info['python_implementation']
+    # reduce the version to major.minor, prefixed with the implementation for anything but CPython
+    major_minor = machine_info['python_version'].split('.')[:2]
+    label = '.'.join(major_minor)
+    if implementation != 'CPython':
+        label = f'{implementation} {label}'
+
+    commit = run['commit_info']['id']
+
+    yield from ((benchmark['name'], label, commit, benchmark['stats'][stat]) for benchmark in run['benchmarks'])
+
+
+def loadf(f):
+    """
+    Load the JSON document stored in a file.
+
+    :param f: path of the file to read
+    :return: the decoded JSON content
+    """
+    # JSON is UTF-8 by specification, don't depend on the locale's default encoding to decode it
+    with open(f, 'r', encoding='utf-8') as json_file:
+        return json.load(json_file)
+
+
+def to_table(benchmarks):
+    """
+    Pivot benchmark differences into a table with one row per benchmark and one column per python version.
+
+    The columns are the union of the python versions of all benchmarks, in order of first appearance. A benchmark
+    lacking a value for one of those versions gets ``None`` in that cell, keeping every value under its own header
+    (taking the headers from the first benchmark only would shift the values of such a row).
+
+    :param benchmarks: iterable of ``(benchmark, python, difference)`` tuples, grouped by benchmark
+    :return: ``(headers, table)``; ``headers`` is a tuple starting with an empty label for the benchmark name
+        column (``None`` if there are no benchmarks), ``table`` is a list of ``(benchmark, *differences)`` tuples
+    """
+    columns = {}  # dict used as an ordered set of python versions
+    rows = []
+
+    for benchmark_name, by_python in groupby(benchmarks, key=itemgetter(0)):
+        by_python = {python: difference for *_, python, difference in by_python}
+        columns.update(dict.fromkeys(by_python))
+        rows.append((benchmark_name, by_python))
+
+    if not rows:
+        return None, []
+
+    headers = ('', *columns)
+    table = [(benchmark_name, *(by_python.get(python) for python in columns)) for benchmark_name, by_python in rows]
+
+    return headers, table
+
+
+if __name__ == '__main__':
+    # command line: the baseline reference is mandatory, everything else has a default
+    parser = ArgumentParser()
+    parser.add_argument('--old', metavar='REF', required=True)
+    parser.add_argument('--new', metavar='REF', default='HEAD')
+    parser.add_argument('--stat', default='median')
+    parser.add_argument('--comment-file')
+
+    args = parser.parse_args()
+
+    # pytest-benchmark will store full commit hashes, git rev-parse the old and new references to get the commit hashes
+    for attribute in ('old', 'new'):
+        reference = getattr(args, attribute)
+        commit = subprocess.check_output(('git', 'rev-parse', reference), text=True).strip()
+        setattr(args, attribute, commit)
+
+    # lazily chain the steps: read all saved runs, combine them per benchmark, compare old to new, pivot to a table
+    runs = (read_run(loadf(path), stat=args.stat) for path in iglob('.benchmarks/*/*.json'))
+    combined = combine_runs(runs, commits={args.old, args.new})
+    differences = compare_benchmarks(combined, old=args.old, new=args.new)
+    headers, rows = to_table(differences)
+
+    table = tabulate(rows, headers=headers, tablefmt='github', floatfmt='+.0%')
+    print(table)
+
+    # optionally store the table as a ready-to-post comment
+    if args.comment_file:
+        comment = COMMENT_TEMPLATE.format(
+            old=args.old,
+            new=args.new,
+            stat=args.stat,
+            better=BETTER.get(args.stat, 'lower'),
+            table=table,
+        )
+        with open(args.comment_file, 'wt') as comment_file:
+            comment_file.write(comment)
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -6,6 +6,7 @@ on:
   pull_request:
 
 jobs:
+
   check:
     runs-on: ubuntu-latest
     steps:
@@ -19,13 +20,17 @@ jobs:
         run: pdm install --group check
       - name: Run checks
         run: pdm run check
+
   test:
     runs-on: ubuntu-latest
+    needs: check
     strategy:
       matrix:
         python-version: ['3.10', '3.11', '3.12', '3.13', '3.14', 'pypy-3.10', 'pypy-3.11']
     steps:
       - uses: actions/checkout@v5
+        with:
+          fetch-depth: 0
       - uses: pdm-project/setup-pdm@v4
         with:
           python-version: ${{ matrix.python-version }}
@@ -38,10 +43,45 @@ jobs:
         run: pdm run test
       - name: Run benchmarks
         run: pdm run benchmark --benchmark-save '${{ matrix.python-version }}-${{ github.sha }}'
+      - name: Run benchmarks for target
+        if: ${{ github.event.pull_request.base.sha }}
+        run: |
+          git switch --detach ${{ github.event.pull_request.base.sha }}
+          pdm run benchmark --benchmark-save '${{ matrix.python-version }}-${{ github.event.pull_request.base.sha }}'
+          git switch --detach -
       - name: Upload benchmark results
+        if: ${{ github.event.pull_request.base.sha }}
         uses: actions/upload-artifact@v4
         with:
-          name: 'benchmarks-${{ matrix.python-version }}-${{ github.sha }}'
+          name: 'benchmarks-${{ matrix.python-version }}'
           path: '.benchmarks/*/*.json'
           include-hidden-files: true
           retention-days: 1
+
+  # Gathers the benchmark results uploaded by the test job and reports the difference between the pull request's
+  # target and the tested commit in a comment on the pull request.
+  compare-benchmarks:
+    runs-on: ubuntu-latest
+    needs: test
+    # there is only a target to compare against when the workflow runs for a pull request
+    if: ${{ github.event.pull_request }}
+    steps:
+      - uses: actions/checkout@v5
+      - uses: pdm-project/setup-pdm@v4
+        with:
+          python-version: '3.14'
+          cache: true
+          cache-dependency-path: 'pylock.toml'
+      # the test dependency group includes tabulate, which the comparison script imports
+      - name: Install test dependencies
+        run: pdm install --group test
+      # collect the benchmarks-<python-version> artifacts of all matrix jobs into a single .benchmarks/ directory
+      # NOTE(review): the script globs .benchmarks/*/*.json, presumably the layout is preserved by the upload step
+      - uses: actions/download-artifact@v5
+        with:
+          pattern: benchmarks-*
+          path: .benchmarks/
+          merge-multiple: true
+      - name: Compare benchmarks
+        run: pdm run python .github/compare-benchmarks.py --old ${{ github.event.pull_request.base.sha }} --new ${{ github.sha }} --comment-file ./BENCHMARK_COMMENT
+      # body-includes matches the marker at the top of the comment template in compare-benchmarks.py, so an earlier
+      # comment from this workflow gets replaced rather than a new one being added on every push
+      - uses: edumserrano/find-create-or-update-comment@v3
+        with:
+          issue-number: ${{ github.event.pull_request.number }}
+          body-includes: '<!-- compare-benchmarks.py -->'
+          comment-author: 'github-actions[bot]'
+          edit-mode: replace
+          body-path: './BENCHMARK_COMMENT'
diff --git a/pylock.toml b/pylock.toml
@@ -310,6 +310,19 @@ marker = "\"check\" in dependency_groups"
 [packages.tool.pdm]
 dependencies = []
 
+[[packages]]
+name = "tabulate"
+version = "0.9.0"
+requires-python = ">=3.7"
+sdist = {name = "tabulate-0.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hashes = {sha256 = "0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}}
+wheels = [
+    {name = "tabulate-0.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl",hashes = {sha256 = "024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}},
+]
+marker = "\"test\" in dependency_groups"
+
+[packages.tool.pdm]
+dependencies = []
+
 [[packages]]
 name = "types-pyyaml"
 version = "6.0.12.20250915"
@@ -590,7 +603,7 @@ marker = "\"test\" in dependency_groups"
 dependencies = []
 
 [tool.pdm]
-hashes = {sha256 = "4d844899bab0d12815b3587bacd58900cc01d8718caf0ba2d0274e0803136606"}
+hashes = {sha256 = "84bb8a9b1c809227e3051f501aa2ea8a09d588205a7c1204a4ddc2fb23b57550"}
 strategy = ["inherit_metadata", "static_urls"]
 
 [[tool.pdm.targets]]

diff --git a/pyproject.toml b/pyproject.toml
@@ -30,9 +30,6 @@ classifiers = [
 [project.urls]
 homepage = "https://github.com/NetherlandsForensicInstitute/confidence/"
 
-[tool.pdm]
-version = {source = "scm"}
-
 [dependency-groups]
 check = [
     "mypy",
@@ -44,11 +41,23 @@ test = [
     "coverage",
     "pytest",
     "pytest-benchmark",
+    "tabulate",
 ]
 
+[tool.pdm]
+version = {source = "scm"}
+
 [tool.pdm.scripts]
 all = {composite = ["check", "test"]}
 benchmark = "pdm run test --benchmark-only --benchmark-autosave tests/"
+# Usage: pdm run benchmark-against <ref>; benchmarks the current checkout and <ref>, then compares the two.
+# The leading git rev-parse presumably serves as a check that <ref> resolves; keep_going presumably makes the later
+# steps (switching back in particular) run even when an earlier step fails (confirm against the pdm scripts docs).
+benchmark-against = {keep_going = true, composite = [
+    "git rev-parse {args}",
+    "pdm run benchmark",
+    "git switch --detach {args}",
+    "pdm run benchmark",
+    "git switch -",
+    "pdm run python .github/compare-benchmarks.py --old {args}"
+]}
 check = {composite = ["check-lock", "format", "lint", "type-check"]}
 check-lock = "pdm lock --check"
 format = "ruff format --diff confidence/ tests/"