Reed-CompBio · tristan-f-r · Jul 9, 2025 · Jul 9, 2025 · Jul 10, 2025 · Jul 10, 2025
diff --git a/Snakefile b/Snakefile
@@ -34,7 +34,6 @@ def get_dataset(_datasets, label):
 algorithms = list(algorithm_params)
 algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
 dataset_labels = list(_config.config.datasets.keys())
-
 dataset_gold_standard_node_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['node_files'] for dataset in gs['dataset_labels']]
 dataset_gold_standard_edge_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['edge_files'] for dataset in gs['dataset_labels']]
 
@@ -282,7 +281,7 @@ rule reconstruct:
 # Original pathway reconstruction output to universal output
 # Use PRRunner as a wrapper to call the algorithm-specific parse_output
 rule parse_output:
-    input: 
+    input:
         raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']),
         dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
     output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt'])

diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
@@ -7,7 +7,7 @@
 
 
 def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
-                       algo_with_params: list) -> pd.DataFrame:
+                       algo_with_params: list[str]) -> pd.DataFrame:
     """
     Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
     in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the

diff --git a/spras/config/config.py b/spras/config/config.py
@@ -13,9 +13,15 @@
 """
 
 import copy as copy
+import functools
+import hashlib
+import importlib.metadata
 import itertools as it
 import os
+import subprocess
+import tomllib
 import warnings
+from pathlib import Path
 from typing import Any
 
 import numpy as np
@@ -27,6 +33,59 @@
 
 config = None
 
+@functools.cache
+def spras_revision() -> str:
+    """
+    Gets the revision of the current SPRAS installation. This function is meant to be user-friendly to warn for bad SPRAS installs.
+    1. If this file is inside the correct `.git` repository, we use the short revision hash. This is for development in SPRAS as well as SPRAS installs via a cloned git repository.
+    2. Otherwise (`git` is missing or fails, e.g. SPRAS was installed via a PyPA-compliant package manager), we use the truncated SHA-256 hash of the RECORD file, which contains the hashes of all files installed by the package (https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file).
+    Raises a RuntimeError if the git top-level is not a valid SPRAS project, or if the installed spras package (or its RECORD file) can't be found.
+    """
+    clone_tip = "Make sure SPRAS is installed through the installation instructions: https://spras.readthedocs.io/en/latest/install.html."
+
+    # Check if we're inside the right git repository
+    try:
+        project_directory = subprocess.check_output(
+            ["git", "rev-parse", "--show-toplevel"],
+            encoding='utf-8',
+            # In case the CWD is not inside the actual SPRAS directory
+            cwd=Path(__file__).parent.resolve()
+        ).strip()
+
+        # We check the pyproject.toml name attribute to confirm that this is the SPRAS project. This is susceptible
+        # to false negatives, but we use this as a preliminary check against bad SPRAS installs.
+        pyproject_path = Path(project_directory, 'pyproject.toml')
+        try:
+            pyproject_toml = tomllib.loads(pyproject_path.read_text(encoding='utf-8'))
+            if "project" not in pyproject_toml or "name" not in pyproject_toml["project"]:
+                raise RuntimeError(f"The git top-level `{pyproject_path}` does not have the expected attributes. {clone_tip}")
+            if pyproject_toml["project"]["name"] != "spras":
+                raise RuntimeError(f"The git top-level `{pyproject_path}` is not the SPRAS pyproject.toml. {clone_tip}")
+
+            return subprocess.check_output(
+                ["git", "rev-parse", "--short", "HEAD"],
+                encoding='utf-8',
+                cwd=project_directory
+            ).strip()
+        except FileNotFoundError as err:
+            # pyproject.toml wasn't found during the `read_text` call
+            raise RuntimeError(f"The git top-level {pyproject_path} wasn't found. {clone_tip}") from err
+        except tomllib.TOMLDecodeError as err:
+            raise RuntimeError(f"The git top-level {pyproject_path} is malformed. {clone_tip}") from err
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        try:
+            # `git` failed (CalledProcessError) or isn't installed (FileNotFoundError): use the truncated hash of the RECORD file in .dist-info instead.
+            record_path = str(importlib.metadata.distribution('spras').locate_file(f"spras-{importlib.metadata.version('spras')}.dist-info/RECORD"))
+            with open(record_path, 'rb', buffering=0) as f:
+                # Truncated to 8 hex characters so it is about as long as a short git revision.
+                return hashlib.file_digest(f, 'sha256').hexdigest()[:8]
+        except (importlib.metadata.PackageNotFoundError, FileNotFoundError) as err:
+            # Either the metadata.distribution call failed or the RECORD file doesn't exist at the expected path.
+            raise RuntimeError(f"The spras package or its RECORD file wasn't found: {clone_tip}") from err
+
+def attach_spras_revision(label: str) -> str:
+    return f"{label}_{spras_revision()}"  # Suffix the label with the (cached) SPRAS revision, separated by an underscore
+
 # This will get called in the Snakefile, instantiating the singleton with the raw config
 def init_global(config_dict):
     global config
@@ -115,6 +174,12 @@ def process_datasets(self, raw_config: RawConfig):
         # Currently assumes all datasets have a label and the labels are unique
         # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
         # Convert to dicts to simplify the yaml logging
+
+        for dataset in raw_config.datasets:
+            dataset.label = attach_spras_revision(dataset.label)
+        for gold_standard in raw_config.gold_standards:
+            gold_standard.label = attach_spras_revision(gold_standard.label)
+
         self.datasets = {}
         for dataset in raw_config.datasets:
             label = dataset.label
@@ -129,8 +194,11 @@ def process_datasets(self, raw_config: RawConfig):
         dataset_labels = set(self.datasets.keys())
         gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']}
         for label in gold_standard_dataset_labels:
-            if label not in dataset_labels:
+            if attach_spras_revision(label) not in dataset_labels:
                 raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.")
+        # We attach the SPRAS revision to the individual dataset labels afterwards for a cleaner error message above. A list (not a lazy, one-shot `map`) is stored since the labels are iterated more than once.
+        for key, gold_standard in self.gold_standards.items():
+            self.gold_standards[key]["dataset_labels"] = [attach_spras_revision(label) for label in gold_standard["dataset_labels"]]
 
         # Code snipped from Snakefile that may be useful for assigning default labels
         # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
@@ -186,7 +254,10 @@ def process_algorithms(self, raw_config: RawConfig):
                             run_dict[param] = float(value)
                         if isinstance(value, np.ndarray):
                             run_dict[param] = value.tolist()
-                    params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder)
+                    # Incorporates the `spras_revision` into the hash
+                    hash_run_dict = copy.deepcopy(run_dict)
+                    hash_run_dict["_spras_rev"] = spras_revision()
+                    params_hash = hash_params_sha1_base32(hash_run_dict, self.hash_length, cls=NpHashEncoder)
                     if params_hash in prior_params_hashes:
                         raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file '
                                         f'(current length {self.hash_length}).')

diff --git a/test/analysis/expected_output/expected_egfr_summary.txt b/test/analysis/expected_output/expected_egfr_summary.txt
@@ -1,10 +1,4 @@
-Name	Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in sources	Nodes in targets	Nodes in active	Nodes in dummy	Parameter combination
-test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt	48	45	3	0.0398936170212766	5	2.0	16	3.882808476926124	27	0	27	27	0	{'module_threshold': 0.05, 'slice_threshold': 0.3}
-test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt	1877	12845	1	0.007295700506524384	469	6.0	6	2.7973618474338107	621	1	620	621	1	{'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
-test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt	28	20	8	0.05291005291005291	4	1.0	5	1.306439393939394	28	1	27	28	1	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 10.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
-test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt	39	31	8	0.04183535762483131	6	1.0	5	1.5084498834498834	39	1	38	39	1	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 2.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
-test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt	14	9	5	0.0989010989010989	4	1.0	2	1.1866666666666668	14	0	14	14	0	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 0.55, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
-test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt	593	591	2	0.0033669841848593955	32	1.0	30	6.72248989073389	531	1	530	531	1	{'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
-test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt	704	702	2	0.002836867968446916	35	1.0	24	6.038766691954387	616	1	615	616	1	{'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
-test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt	14	17	1	0.18681318681318682	6	2.0	7	2.857142857142857	6	1	5	6	1	{'k': 10}
-test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt	25	32	1	0.10666666666666667	8	2.0	7	3.486666666666667	11	1	10	11	1	{'k': 20}
+Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in sources	Nodes in targets	Nodes in active	Nodes in dummy	Parameter combination
+14	17	1	0.18681318681318682	6	2.0	7	2.857142857142857	6	1	5	6	1	{'k': 10}
+25	32	1	0.10666666666666667	8	2.0	7	3.486666666666667	11	1	10	11	1	{'k': 20}
+1874	12845	1	0.007319084148670001	469	6.0	6	2.7952001166950904	621	1	620	621	1	{'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
diff --git a/test/analysis/expected_output/expected_example_summary.txt b/test/analysis/expected_output/expected_example_summary.txt
@@ -1,13 +1,6 @@
-Name	Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in active	Nodes in dummy	Nodes in sources	Nodes in targets	Parameter combination
-test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{}
-test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'module_threshold': 0.05, 'slice_threshold': 0.3}
-test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
-test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'flow': 1, 'capacity': 1}
-test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
-test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
-test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
-test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
-test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt	0	0	0	0.0	0	0.0	0	0.0	0	0	0	0	0	{'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
-test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
-test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'k': 200}
-test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt	3	2	1	0.6666666666666666	2	1.0	2	1.3333333333333333	2	2	1	1	1	{'k': 100}
+Number of nodes	Number of edges	Number of connected components	Density	Max degree	Median degree	Max diameter	Average path length	Nodes in prize	Nodes in active	Nodes in dummy	Nodes in sources	Nodes in targets	Parameter combination
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'flow': 1, 'capacity': 1}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'k': 100}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'k': 200}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
+3	2	1	0.6666666666666666	2	1	2	1.3333333333333333	2	2	0	1	1	{}
diff --git a/test/analysis/input/.gitignore b/test/analysis/input/.gitignore
@@ -0,0 +1 @@
+run
diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml