24 changes: 20 additions & 4 deletions Snakefile
@@ -2,10 +2,11 @@ import os
from spras import runner
import shutil
import yaml
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.analysis import ml, summary, cytoscape
import spras.config.config as _config
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.statistics import from_output_pathway, statistics_computation, statistics_options

# Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
# and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them
@@ -310,18 +311,33 @@ rule viz_cytoscape:
    run:
        cytoscape.run_cytoscape(input.pathways, output.session, container_settings)

for stat_names, stat_func in statistics_computation.items():
    pythonic_name = 'generate_' + '_and_'.join([name.lower().replace(' ', '_') for name in stat_names])
    rule:
        name: pythonic_name
        input: pathway_file = rules.parse_output.output.standardized_file
        output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{name}.txt']) for name in stat_names]
        run:
            (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True)
            graph = from_output_pathway(input.pathway_file)
            # Avoid shadowing the rule's output list while iterating over it
            for computed, out_file in zip(stat_func(graph), output):
                Path(out_file).write_text(str(computed))

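The loop above registers one dynamically named rule per entry in `statistics_computation`, so each group of related statistics is produced by a single rule. A minimal sketch of the name derivation, runnable outside Snakemake (the key tuple is taken from `spras/statistics.py`):

    stat_names = ('Max degree', 'Median degree')
    pythonic_name = 'generate_' + '_and_'.join([name.lower().replace(' ', '_') for name in stat_names])
    assert pythonic_name == 'generate_max_degree_and_median_degree'
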
# Write a single summary table for all pathways for each dataset
rule summary_table:
    input:
        # Collect all pathways generated for the dataset
        pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params),
        dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
        dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']),
        # Collect all possible options
        statistics = expand(
            '{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}statistics{sep}{statistic}.txt',
            out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, statistic=statistics_options)
    output: summary_table = SEP.join([out_dir, '{dataset}-pathway-summary.txt'])
    run:
        # Load the node table from the pickled dataset file
        node_table = Dataset.from_file(input.dataset_file).node_table
        summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params)
        summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params, input.statistics)
        summary_df.to_csv(output.summary_table, sep='\t', index=False)

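For a concrete picture of the new `statistics` input (all names here are hypothetical), with `out_dir = 'output'`, a dataset `egfr`, and the combination `pathlinker-params-abc123`, the `expand()` call yields one file per statistic, e.g. `output/egfr-pathlinker-params-abc123/statistics/Density.txt`. Note that statistic names such as 'Max degree' contain spaces, so the generated file names do too. Any one of these files can also be requested directly as a Snakemake target:

    snakemake --cores 1 'output/egfr-pathlinker-params-abc123/statistics/Max degree.txt'
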
# Cluster the output pathways for each dataset
57 changes: 13 additions & 44 deletions spras/analysis/summary.py
@@ -1,13 +1,14 @@
import ast
from pathlib import Path
from statistics import median
from typing import Iterable

import networkx as nx
import pandas as pd

from spras.statistics import from_output_pathway


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
                       algo_with_params: list) -> pd.DataFrame:
                       algo_with_params: list, statistics_files: list) -> pd.DataFrame:
    """
    Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
    in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
@@ -17,6 +18,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
    @param algo_params: a nested dict mapping algorithm names to dicts that map parameter hashes to parameter
    combinations.
    @param algo_with_params: a list of <algorithm>-params-<params_hash> combinations
    @param statistics_files: a list of files containing the computed statistics, one value per file
    @return: pandas DataFrame with summary information
    """
    # Ensure that NODEID is the first column
@@ -39,52 +41,17 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg

    # Iterate through each network file path
    for index, file_path in enumerate(sorted(file_paths)):
        with open(file_path, 'r') as f:
            lines = f.readlines()[1:]  # skip the header line

        # directed or mixed graphs are parsed and summarized as an undirected graph
        nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))
        nw = from_output_pathway(file_path)

        # Save the network name, number of nodes, number edges, and number of connected components
        nw_name = str(file_path)
        number_nodes = nw.number_of_nodes()
        number_edges = nw.number_of_edges()
        ncc = nx.number_connected_components(nw)

        # Save the max/median degree, average clustering coefficient, and density
        if number_nodes == 0:
            max_degree = 0
            median_degree = 0.0
            density = 0.0
        else:
            degrees = [deg for _, deg in nw.degree()]
            max_degree = max(degrees)
            median_degree = median(degrees)
            density = nx.density(nw)

        cc = list(nx.connected_components(nw))
        # Save the max diameter
        # Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
        diameters = [
            nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0
            for c in cc
        ]
        max_diameter = max(diameters, default=0)

        # Save the average path lengths
        # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
        avg_path_lengths = [
            nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0
            for c in cc
        ]

        if len(avg_path_lengths) != 0:
            avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
        else:
            avg_path_len = 0.0

        # We use literal_eval here to easily coerce to either ints or floats, depending.
        graph_statistics = [ast.literal_eval(Path(file).read_text()) for file in statistics_files]

        # Initialize list to store current network information
        cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len]
        cur_nw_info = [nw_name, *graph_statistics]

        # Iterate through each node property and save the intersection with the current network
        for node_list in nodes_by_col:
@@ -105,8 +72,10 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
        # Save the current network information to the network summary list
        nw_info.append(cur_nw_info)

    # Get the list of statistic names by their file names
    statistics_options = [Path(file).stem for file in statistics_files]
    # Prepare column names
    col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length']
    col_names = ['Name', *statistics_options]
    col_names.extend(nodes_by_col_labs)
    col_names.append('Parameter combination')

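A brief note on the `ast.literal_eval` coercion above: each statistics file holds a single numeric literal written by the generated rules, so integers and floats round-trip to their original Python types. A minimal sketch with hypothetical file contents:

    import ast
    assert ast.literal_eval('17') == 17        # int, e.g. a node count
    assert ast.literal_eval('0.125') == 0.125  # float, e.g. a density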
70 changes: 70 additions & 0 deletions spras/statistics.py
@@ -0,0 +1,70 @@
"""
Graph statistics, used to power summary.py.

We allow arbitrary computation of any specific statistic on a graph,
computing several related statistics together when they share intermediate
work. See the top-level `statistics_computation` dictionary for usage.
"""

import itertools
from statistics import median
from typing import Callable

import networkx as nx


def compute_degree(graph: nx.DiGraph) -> tuple[int, float]:
    """
    Computes the (max, median) degree of a `graph`.
    """
    # number_of_nodes is a cheap call
    if graph.number_of_nodes() == 0:
        return (0, 0.0)
    else:
        degrees = [deg for _, deg in graph.degree()]
        return max(degrees), median(degrees)

def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
    """
    Computes the (max diameter, average path length) over the connected components of a graph.
    """
    graph: nx.Graph = directed_graph.to_undirected()
    cc = list(nx.connected_components(graph))
    # Save the max diameter
    # Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
    diameters = [
        nx.diameter(graph.subgraph(c).copy()) if len(c) > 1 else 0
        for c in cc
    ]
    max_diameter = max(diameters, default=0)

    # Save the average path lengths
    # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
    avg_path_lengths = [
        nx.average_shortest_path_length(graph.subgraph(c).copy()) if len(c) > 1 else 0.0
        for c in cc
    ]

    if len(avg_path_lengths) != 0:
        avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
    else:
        avg_path_len = 0.0

    return max_diameter, avg_path_len

# The type signature here is imprecise: ideally, an n-tuple key would guarantee n outputs.
statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = {
    ('Number of nodes',): lambda graph: (graph.number_of_nodes(),),
    ('Number of edges',): lambda graph: (graph.number_of_edges(),),
    ('Number of connected components',): lambda graph: (nx.number_connected_components(graph.to_undirected()),),
    ('Density',): lambda graph: (nx.density(graph),),

    ('Max degree', 'Median degree'): compute_degree,
    ('Max diameter', 'Average path length'): compute_on_cc,
}

# All of the keys inside statistics_computation, flattened.
statistics_options: list[str] = list(itertools.chain.from_iterable(statistics_computation.keys()))

def from_output_pathway(file_path) -> nx.Graph:
    """
    Read a standardized output pathway file into a NetworkX graph, skipping the header line.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()[1:]  # skip the header line

    return nx.read_edgelist(lines, data=(('Rank', int), ('Direction', str)))
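
To make the registry's contract concrete, here is a minimal sketch of consuming `statistics_computation` outside Snakemake; the pathway file name is hypothetical, and the file is assumed to be a standardized output pathway with one header line:

    from spras.statistics import from_output_pathway, statistics_computation

    graph = from_output_pathway('pathway.txt')
    for stat_names, stat_func in statistics_computation.items():
        # Each callable returns one value per name in its key tuple
        for name, value in zip(stat_names, stat_func(graph)):
            print(f'{name}: {value}')

Registering a new statistic only requires adding an entry whose key tuple names its outputs; the Snakefile loop then generates the corresponding rule automatically.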
24 changes: 12 additions & 12 deletions test/analysis/test_summary.py
@@ -12,9 +12,9 @@
# - 'NODEID' is required as the first column label in the node table
# - file_paths must be an iterable, even if a single file path is passed

INPUT_DIR = 'test/analysis/input/'
OUT_DIR = 'test/analysis/output/'
EXPECT_DIR = 'test/analysis/expected_output/'
INPUT_DIR = Path('test', 'analysis', 'input')
OUT_DIR = Path('test', 'analysis', 'output')
EXPECT_DIR = Path('test', 'analysis', 'expected_output')


class TestSummary:
@@ -35,14 +35,14 @@ def test_example_networks(self):
        }
        example_dataset = Dataset(example_dict)
        example_node_table = example_dataset.node_table
        config.init_from_file(INPUT_DIR + "config.yaml")
        config.init_from_file(INPUT_DIR / "config.yaml")
        algorithm_params = config.config.algorithm_params
        algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in
                                  algorithm_params.items() for params_hash in param_combos.keys()]

        example_network_files = Path(INPUT_DIR + "example").glob("*.txt")  # must be path to use .glob()
        example_network_files = Path(INPUT_DIR, "example").glob("*.txt")

        out_path = Path(OUT_DIR + "test_example_summary.txt")
        out_path = Path(OUT_DIR, "test_example_summary.txt")
        out_path.unlink(missing_ok=True)
        summarize_example = summarize_networks(example_network_files, example_node_table, algorithm_params,
                                               algorithms_with_params)
@@ -51,7 +51,7 @@ def test_example_networks(self):

        # Comparing the dataframes directly with equals does not match because of how the parameter
        # combinations column is loaded from disk. Therefore, write both to disk and compare the files.
        assert filecmp.cmp(out_path, EXPECT_DIR + "expected_example_summary.txt", shallow=False)
        assert filecmp.cmp(out_path, EXPECT_DIR / "expected_example_summary.txt", shallow=False)

    def test_egfr_networks(self):
        """Test data from EGFR workflow"""
@@ -64,14 +64,14 @@ def test_egfr_networks(self):

        egfr_dataset = Dataset(egfr_dict)
        egfr_node_table = egfr_dataset.node_table
        config.init_from_file(INPUT_DIR + "egfr.yaml")
        config.init_from_file(INPUT_DIR / "egfr.yaml")
        algorithm_params = config.config.algorithm_params
        algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in
                                  algorithm_params.items() for params_hash in param_combos.keys()]

        egfr_network_files = Path(INPUT_DIR + "egfr").glob("*.txt")  # must be path to use .glob()
        egfr_network_files = Path(INPUT_DIR, "egfr").glob("*.txt")  # must be path to use .glob()

        out_path = Path(OUT_DIR + "test_egfr_summary.txt")
        out_path = Path(OUT_DIR, "test_egfr_summary.txt")
        out_path.unlink(missing_ok=True)
        summarize_egfr = summarize_networks(egfr_network_files, egfr_node_table, algorithm_params,
                                            algorithms_with_params)
@@ -80,7 +80,7 @@ def test_egfr_networks(self):

        # Comparing the dataframes directly with equals does not match because of how the parameter
        # combinations column is loaded from disk. Therefore, write both to disk and compare the files.
        assert filecmp.cmp(out_path, EXPECT_DIR + "expected_egfr_summary.txt", shallow=False)
        assert filecmp.cmp(out_path, EXPECT_DIR / "expected_egfr_summary.txt", shallow=False)

    def test_load_dataset_dict(self):
        """Test loading files from dataset_dict"""
@@ -95,7 +95,7 @@ def test_load_dataset_dict(self):

        # node_table contents are not generated consistently in the same order,
        # so we will check that the contents are the same, but row order doesn't matter
        expected_node_table = pd.read_csv((EXPECT_DIR + "expected_node_table.txt"), sep="\t")
        expected_node_table = pd.read_csv((EXPECT_DIR / "expected_node_table.txt"), sep="\t")

        # ignore 'NODEID' column because this changes each time upon new generation
        cols_to_compare = [col for col in example_node_table.columns if col != "NODEID"]
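
The switch from string concatenation to `pathlib` in this file relies on `Path` joining semantics; a quick illustration using the test constants above:

    from pathlib import Path
    assert Path('test', 'analysis', 'input') / 'config.yaml' == Path('test/analysis/input/config.yaml')

Both the multi-argument `Path(...)` constructor and the `/` operator join components with the platform's separator, which is why the directory constants no longer need trailing slashes.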