From 6ec4f6237970a9cbf4cb8defd42ab8620b6d7c92 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 10 Oct 2025 06:32:06 +0000 Subject: [PATCH 01/11] refactor: separate statistic computation we also make it lazy --- spras/analysis/summary.py | 44 +++----------------- spras/statistics.py | 88 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 38 deletions(-) create mode 100644 spras/statistics.py diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index c8abc1cad..fd70db8f3 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -1,10 +1,11 @@ from pathlib import Path -from statistics import median from typing import Iterable import networkx as nx import pandas as pd +from spras.statistics import compute_statistics, statistics_options + def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict], algo_with_params: list) -> pd.DataFrame: @@ -47,44 +48,11 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg # Save the network name, number of nodes, number edges, and number of connected components nw_name = str(file_path) - number_nodes = nw.number_of_nodes() - number_edges = nw.number_of_edges() - ncc = nx.number_connected_components(nw) - - # Save the max/median degree, average clustering coefficient, and density - if number_nodes == 0: - max_degree = 0 - median_degree = 0.0 - density = 0.0 - else: - degrees = [deg for _, deg in nw.degree()] - max_degree = max(degrees) - median_degree = median(degrees) - density = nx.density(nw) - - cc = list(nx.connected_components(nw)) - # Save the max diameter - # Use diameter only for components with ≥2 nodes (singleton components have diameter 0) - diameters = [ - nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0 - for c in cc - ] - max_diameter = max(diameters, default=0) - - # Save the average path lengths - # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0) - avg_path_lengths = [ - nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0 - for c in cc - ] - - if len(avg_path_lengths) != 0: - avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) - else: - avg_path_len = 0.0 + + graph_statistics = compute_statistics(nw, statistics_options) # Initialize list to store current network information - cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len] + cur_nw_info = [nw_name, *graph_statistics] # Iterate through each node property and save the intersection with the current network for node_list in nodes_by_col: @@ -104,7 +72,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg nw_info.append(cur_nw_info) # Prepare column names - col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length'] + col_names = ['Name', *statistics_options] col_names.extend(nodes_by_col_labs) col_names.append('Parameter combination') diff --git a/spras/statistics.py b/spras/statistics.py new file mode 100644 index 000000000..843e5292a --- /dev/null +++ b/spras/statistics.py @@ -0,0 +1,88 @@ +""" +Graph statistics, used to power summary.py. + +We allow for arbitrary computation of any specific statistic on some graph, +computing more than necessary if we have dependencies. See the top level +`statistics_computation` dictionary for usage. 
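+
+As a rough sketch of the intended use (hypothetical graph; the available
+statistic names are the tuple entries in `statistics_computation` below):
+
+    import networkx as nx
+    from spras.statistics import compute_statistics
+
+    graph = nx.Graph([("A", "B"), ("B", "C")])
+    stats = compute_statistics(graph, ["Number of nodes", "Max degree"])
+    # stats == {"Number of nodes": 3, "Max degree": 2}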
+""" + +import itertools +import networkx as nx +from statistics import median +from typing import Callable + +def compute_degree(graph: nx.DiGraph) -> tuple[int, float]: + """ + Computes the (max, median) degree of a `graph`. + """ + # number_of_nodes is a cheap call + if graph.number_of_nodes() == 0: + return (0, 0.0) + else: + degrees = [deg for _, deg in graph.degree()] + return max(degrees), median(degrees) + +def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]: + cc = list(nx.connected_components(graph)) + # Save the max diameter + # Use diameter only for components with ≥2 nodes (singleton components have diameter 0) + diameters = [ + nx.diameter(graph.subgraph(c).copy()) if len(c) > 1 else 0 + for c in cc + ] + max_diameter = max(diameters, default=0) + + # Save the average path lengths + # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0) + avg_path_lengths = [ + nx.average_shortest_path_length(graph.subgraph(c).copy()) if len(c) > 1 else 0.0 + for c in cc + ] + + if len(avg_path_lengths) != 0: + avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) + else: + avg_path_len = 0.0 + + return max_diameter, avg_path_len + +# The type signature on here is quite bad. I would like to say that an n-tuple has n-outputs. +statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = { + ('Number of nodes',): lambda graph : (graph.number_of_nodes(),), + ('Number of edges',): lambda graph : (graph.number_of_edges(),), + ('Number of connected components',): lambda graph : (nx.number_connected_components(graph),), + ('Density',): lambda graph : (nx.density(graph),), + + ('Max degree', 'Median degree'): compute_degree, + ('Max diameter', 'Average path length'): compute_on_cc, +} + +# All of the keys inside statistics_computation, flattened. +statistics_options: list[str] = list(itertools.chain(*(list(key) for key in statistics_computation.keys()))) + +def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, float | int]: + """ + Computes `statistics` for a graph corresponding to the top-level `statistics` dictionary + in this file. + """ + + # early-scan cutoff for statistics: + # we want to err as soon as possible + for stat in statistics: + if stat not in statistics_options: + raise RuntimeError(f"Statistic {stat} not a computable statistics! Available statistics: {statistics_options}") + + # now, we can compute statistics only + computed_statistics: dict[str, float | int] = dict() + for statistic_tuple, compute in statistics_computation.items(): + # when we want them + if not set(statistic_tuple).isdisjoint(set(statistics)): + computed_tuple = compute(graph) + assert len(statistic_tuple) == computed_tuple, f"bad tuple length for {statistic_tuple}" + + current_computed_statistics = zip(statistic_tuple, computed_tuple) + for stat, value in current_computed_statistics: + computed_statistics[stat] = value + + # (and return only the statistics we wanted) + return {key: computed_statistics[key] for key in statistics} From 9987189d8e0d9a9006ae1897cd44836500a5c906 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Fri, 10 Oct 2025 06:48:54 +0000 Subject: [PATCH 02/11] fix: correct tuple assumption --- spras/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/statistics.py b/spras/statistics.py index 843e5292a..ac91b80a9 100644 --- a/spras/statistics.py +++ b/spras/statistics.py @@ -78,7 +78,7 @@ def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, fl # when we want them if not set(statistic_tuple).isdisjoint(set(statistics)): computed_tuple = compute(graph) - assert len(statistic_tuple) == computed_tuple, f"bad tuple length for {statistic_tuple}" + assert len(statistic_tuple) == len(computed_tuple), f"bad tuple length for {statistic_tuple}" current_computed_statistics = zip(statistic_tuple, computed_tuple) for stat, value in current_computed_statistics: From 25eef5e72aee4fb7aea6f6b5e9d11dff7fd5be16 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 10 Oct 2025 07:06:46 +0000 Subject: [PATCH 03/11] fix: stably use graph statistic values --- spras/analysis/summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index fd70db8f3..432dba0a4 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -52,7 +52,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg graph_statistics = compute_statistics(nw, statistics_options) # Initialize list to store current network information - cur_nw_info = [nw_name, *graph_statistics] + cur_nw_info = [nw_name, *graph_statistics.values()] # Iterate through each node property and save the intersection with the current network for node_list in nodes_by_col: From cb373c130760c7040b16ec03ba1d2673e343465b Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 29 Oct 2025 17:56:22 -0700 Subject: [PATCH 04/11] style: fmt --- spras/config/config.py | 4 ++-- spras/statistics.py | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 22e655941..add815d9d 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -71,7 +71,7 @@ def __init__(self, raw_config: dict[str, Any]): self.container_prefix: str = DEFAULT_CONTAINER_PREFIX # A Boolean specifying whether to unpack singularity containers. Default is False self.unpack_singularity = False - # A Boolean indiciating whether to enable container runtime profiling (apptainer/singularity only) + # A Boolean indicating whether to enable container runtime profiling (apptainer/singularity only) self.enable_profiling = False # A dictionary to store configured datasets against which SPRAS will be run self.datasets = None @@ -308,7 +308,7 @@ def process_config(self, raw_config: RawConfig): if raw_config.container_registry and raw_config.container_registry.base_url != "" and raw_config.container_registry.owner != "": self.container_prefix = raw_config.container_registry.base_url + "/" + raw_config.container_registry.owner - if raw_config.enable_profiling and not raw_config.container_framework in ["singularity", "apptainer"]: + if raw_config.enable_profiling and raw_config.container_framework not in ["singularity", "apptainer"]: warnings.warn("enable_profiling is set to true, but the container framework is not singularity/apptainer. 
This setting will have no effect.") self.enable_profiling = raw_config.enable_profiling diff --git a/spras/statistics.py b/spras/statistics.py index ac91b80a9..49ae8b3fc 100644 --- a/spras/statistics.py +++ b/spras/statistics.py @@ -7,10 +7,12 @@ """ import itertools -import networkx as nx from statistics import median from typing import Callable +import networkx as nx + + def compute_degree(graph: nx.DiGraph) -> tuple[int, float]: """ Computes the (max, median) degree of a `graph`. @@ -43,7 +45,7 @@ def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]: avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) else: avg_path_len = 0.0 - + return max_diameter, avg_path_len # The type signature on here is quite bad. I would like to say that an n-tuple has n-outputs. @@ -52,7 +54,7 @@ def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]: ('Number of edges',): lambda graph : (graph.number_of_edges(),), ('Number of connected components',): lambda graph : (nx.number_connected_components(graph),), ('Density',): lambda graph : (nx.density(graph),), - + ('Max degree', 'Median degree'): compute_degree, ('Max diameter', 'Average path length'): compute_on_cc, } @@ -63,7 +65,7 @@ def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]: def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, float | int]: """ Computes `statistics` for a graph corresponding to the top-level `statistics` dictionary - in this file. + in this file. """ # early-scan cutoff for statistics: @@ -71,7 +73,7 @@ def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, fl for stat in statistics: if stat not in statistics_options: raise RuntimeError(f"Statistic {stat} not a computable statistics! Available statistics: {statistics_options}") - + # now, we can compute statistics only computed_statistics: dict[str, float | int] = dict() for statistic_tuple, compute in statistics_computation.items(): From 898d568a49053467d74af1cb952bdceac400436d Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 29 Oct 2025 18:15:23 -0700 Subject: [PATCH 05/11] style: specify zip strict --- spras/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/statistics.py b/spras/statistics.py index 49ae8b3fc..1ebe7cc62 100644 --- a/spras/statistics.py +++ b/spras/statistics.py @@ -82,7 +82,7 @@ def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, fl computed_tuple = compute(graph) assert len(statistic_tuple) == len(computed_tuple), f"bad tuple length for {statistic_tuple}" - current_computed_statistics = zip(statistic_tuple, computed_tuple) + current_computed_statistics = zip(statistic_tuple, computed_tuple, strict=True) for stat, value in current_computed_statistics: computed_statistics[stat] = value From c675eced3b62b8a62204d9f6105628e1cdc09045 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Thu, 6 Nov 2025 02:22:45 +0000 Subject: [PATCH 06/11] fix: make undirected for determining number of connected components --- spras/statistics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spras/statistics.py b/spras/statistics.py index 1ebe7cc62..222051d23 100644 --- a/spras/statistics.py +++ b/spras/statistics.py @@ -24,7 +24,8 @@ def compute_degree(graph: nx.DiGraph) -> tuple[int, float]: degrees = [deg for _, deg in graph.degree()] return max(degrees), median(degrees) -def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]: +def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]: + graph: nx.Graph = directed_graph.to_undirected() cc = list(nx.connected_components(graph)) # Save the max diameter # Use diameter only for components with ≥2 nodes (singleton components have diameter 0) @@ -52,7 +53,7 @@ def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]: statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = { ('Number of nodes',): lambda graph : (graph.number_of_nodes(),), ('Number of edges',): lambda graph : (graph.number_of_edges(),), - ('Number of connected components',): lambda graph : (nx.number_connected_components(graph),), + ('Number of connected components',): lambda graph : (nx.number_connected_components(graph.to_undirected()),), ('Density',): lambda graph : (nx.density(graph),), ('Max degree', 'Median degree'): compute_degree, From 1ca730e4cd36e0542fbd90496d972997db340d19 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 13 Jan 2026 12:13:21 -0800 Subject: [PATCH 07/11] feat: snakemake-based summary generation --- Snakefile | 24 ++++++++++++++++++++---- spras/analysis/summary.py | 15 +++++++++------ spras/statistics.py | 28 ++-------------------------- 3 files changed, 31 insertions(+), 36 deletions(-) diff --git a/Snakefile b/Snakefile index cf075b0fa..060696c71 100644 --- a/Snakefile +++ b/Snakefile @@ -2,10 +2,11 @@ import os from spras import runner import shutil import yaml -from spras.dataset import Dataset -from spras.evaluation import Evaluation from spras.analysis import ml, summary, cytoscape import spras.config.config as _config +from spras.dataset import Dataset +from spras.evaluation import Evaluation +from spras.statistics import from_edgelist, statistics_computation, statistics_options # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them @@ -310,18 +311,33 @@ rule viz_cytoscape: run: cytoscape.run_cytoscape(input.pathways, output.session, container_settings) +for keys, values in statistics_computation.items(): + pythonic_name = 'generate_' + '_and_'.join([key.lower().replace(' ', '_') for key in keys]) + rule: + name: pythonic_name + input: pathway_file = rules.reconstruct.output.pathway_file + output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys] + run: + (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True) + graph = from_edgelist(input.pathway_file) + for computed, output in zip(values(graph), output): + Path(output).write_text(str(computed)) # Write a single summary table for all pathways for each dataset rule summary_table: input: # Collect all pathways generated for the dataset pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params), - 
dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']) + dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']), + # Collect all possible options + statistics = expand( + '{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}statistics{sep}{statistic}.txt', + out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, statistic=statistics_options) output: summary_table = SEP.join([out_dir, '{dataset}-pathway-summary.txt']) run: # Load the node table from the pickled dataset file node_table = Dataset.from_file(input.dataset_file).node_table - summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params) + summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params, input.statistics) summary_df.to_csv(output.summary_table, sep='\t', index=False) # Cluster the output pathways for each dataset diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index cdffe0f68..0bd025aa4 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -1,14 +1,14 @@ +import ast from pathlib import Path from typing import Iterable -import networkx as nx import pandas as pd -from spras.statistics import compute_statistics, statistics_options +from spras.statistics import from_edgelist def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict], - algo_with_params: list) -> pd.DataFrame: + algo_with_params: list, statistics_files: list) -> pd.DataFrame: """ Generate a table that aggregates summary information about networks in file_paths, including which nodes are present in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the @@ -44,15 +44,16 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg lines = f.readlines()[1:] # skip the header line # directed or mixed graphs are parsed and summarized as an undirected graph - nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str))) + nw = from_edgelist(lines) # Save the network name, number of nodes, number edges, and number of connected components nw_name = str(file_path) - graph_statistics = compute_statistics(nw, statistics_options) + # We use literal_eval here to easily coerce to either ints or floats, depending. + graph_statistics = [ast.literal_eval(Path(file).read_text()) for file in statistics_files] # Initialize list to store current network information - cur_nw_info = [nw_name, *graph_statistics.values()] + cur_nw_info = [nw_name, *graph_statistics] # Iterate through each node property and save the intersection with the current network for node_list in nodes_by_col: @@ -73,6 +74,8 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg # Save the current network information to the network summary list nw_info.append(cur_nw_info) + # Get the list of statistic names by their file names + statistics_options = [Path(file).stem for file in statistics_files] # Prepare column names col_names = ['Name', *statistics_options] col_names.extend(nodes_by_col_labs) diff --git a/spras/statistics.py b/spras/statistics.py index 222051d23..7bc8253c6 100644 --- a/spras/statistics.py +++ b/spras/statistics.py @@ -63,29 +63,5 @@ def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]: # All of the keys inside statistics_computation, flattened. 
 statistics_options: list[str] = list(itertools.chain(*(list(key) for key in statistics_computation.keys())))
 
-def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, float | int]:
-    """
-    Computes `statistics` for a graph corresponding to the top-level `statistics` dictionary
-    in this file.
-    """
-
-    # early-scan cutoff for statistics:
-    # we want to err as soon as possible
-    for stat in statistics:
-        if stat not in statistics_options:
-            raise RuntimeError(f"Statistic {stat} not a computable statistics! Available statistics: {statistics_options}")
-
-    # now, we can compute statistics only
-    computed_statistics: dict[str, float | int] = dict()
-    for statistic_tuple, compute in statistics_computation.items():
-        # when we want them
-        if not set(statistic_tuple).isdisjoint(set(statistics)):
-            computed_tuple = compute(graph)
-            assert len(statistic_tuple) == len(computed_tuple), f"bad tuple length for {statistic_tuple}"
-
-            current_computed_statistics = zip(statistic_tuple, computed_tuple, strict=True)
-            for stat, value in current_computed_statistics:
-                computed_statistics[stat] = value
-
-    # (and return only the statistics we wanted)
-    return {key: computed_statistics[key] for key in statistics}
+def from_edgelist(lines) -> nx.Graph:
+    return nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))

From d67186dcd5679c44264b24836d86f25816aecb52 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Tue, 13 Jan 2026 12:19:43 -0800
Subject: [PATCH 08/11] fix(Snakefile): use parse_output for edgelist parsing

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 060696c71..532be6fe7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -315,10 +315,10 @@ for keys, values in statistics_computation.items():
     pythonic_name = 'generate_' + '_and_'.join([key.lower().replace(' ', '_') for key in keys])
     rule:
         name: pythonic_name
-        input: pathway_file = rules.reconstruct.output.pathway_file
+        input: pathway_file = rules.parse_output.output.standardized_file
         output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys]
         run:
             (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True)
             graph = from_edgelist(input.pathway_file)
             for computed, output in zip(values(graph), output):
                 Path(output).write_text(str(computed))

From fd483c3af8ab15bb5b1717b6a33b1ae338b25472 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Tue, 13 Jan 2026 12:37:46 -0800
Subject: [PATCH 09/11] fix: parse edgelist with rank, embed header skip inside
 from_edgelist

The previous behavior was incorrect in two ways: the Snakefile rule handed
a raw pathway file to the parser, so the header row was read as an edge,
and the third column of a standardized pathway file holds an integer rank,
not a float weight.
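A minimal sketch of the corrected parsing (the column names below are
hypothetical; the real header comes from the standardized pathway format):

    import networkx as nx

    # pathway.txt (hypothetical contents):
    #   Node1  Node2  Rank  Direction
    #   A      B      1     U
    with open("pathway.txt") as f:
        lines = f.readlines()[1:]  # drop the header row
    # the third column is parsed as an integer rank rather than a float weight
    graph = nx.read_edgelist(lines, data=(("Rank", int), ("Direction", str)))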
--- Snakefile | 4 ++-- spras/analysis/summary.py | 7 ++----- spras/statistics.py | 7 +++++-- test/analysis/test_summary.py | 24 ++++++++++++------------ 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Snakefile b/Snakefile index 532be6fe7..9673b80b0 100644 --- a/Snakefile +++ b/Snakefile @@ -6,7 +6,7 @@ from spras.analysis import ml, summary, cytoscape import spras.config.config as _config from spras.dataset import Dataset from spras.evaluation import Evaluation -from spras.statistics import from_edgelist, statistics_computation, statistics_options +from spras.statistics import from_output_pathway, statistics_computation, statistics_options # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them @@ -319,7 +319,7 @@ for keys, values in statistics_computation.items(): output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys] run: (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True) - graph = from_edgelist(input.pathway_file) + graph = from_output_pathway(input.pathway_file) for computed, output in zip(values(graph), output): Path(output).write_text(str(computed)) diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index 0bd025aa4..1f627493f 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -4,7 +4,7 @@ import pandas as pd -from spras.statistics import from_edgelist +from spras.statistics import from_output_pathway def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict], @@ -40,11 +40,8 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg # Iterate through each network file path for index, file_path in enumerate(sorted(file_paths)): - with open(file_path, 'r') as f: - lines = f.readlines()[1:] # skip the header line - # directed or mixed graphs are parsed and summarized as an undirected graph - nw = from_edgelist(lines) + nw = from_output_pathway(file_path) # Save the network name, number of nodes, number edges, and number of connected components nw_name = str(file_path) diff --git a/spras/statistics.py b/spras/statistics.py index 7bc8253c6..5399da390 100644 --- a/spras/statistics.py +++ b/spras/statistics.py @@ -63,5 +63,8 @@ def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]: # All of the keys inside statistics_computation, flattened. 
statistics_options: list[str] = list(itertools.chain(*(list(key) for key in statistics_computation.keys()))) -def from_edgelist(lines) -> nx.Graph: - return nx.read_edgelist(lines, data=(('weight', float), ('Direction', str))) +def from_output_pathway(lines) -> nx.Graph: + with open(lines, 'r') as f: + lines = f.readlines()[1:] + + return nx.read_edgelist(lines, data=(('Rank', int), ('Direction', str))) diff --git a/test/analysis/test_summary.py b/test/analysis/test_summary.py index 57f1f6012..8618f0a2f 100644 --- a/test/analysis/test_summary.py +++ b/test/analysis/test_summary.py @@ -12,9 +12,9 @@ # - 'NODEID' is required as the first column label in the node table # - file_paths must be an iterable, even if a single file path is passed -INPUT_DIR = 'test/analysis/input/' -OUT_DIR = 'test/analysis/output/' -EXPECT_DIR = 'test/analysis/expected_output/' +INPUT_DIR = Path('test', 'analysis', 'input') +OUT_DIR = Path('test', 'analysis', 'output') +EXPECT_DIR = Path('test', 'analysis', 'expected_output') class TestSummary: @@ -35,14 +35,14 @@ def test_example_networks(self): } example_dataset = Dataset(example_dict) example_node_table = example_dataset.node_table - config.init_from_file(INPUT_DIR + "config.yaml") + config.init_from_file(INPUT_DIR / "config.yaml") algorithm_params = config.config.algorithm_params algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()] - example_network_files = Path(INPUT_DIR + "example").glob("*.txt") # must be path to use .glob() + example_network_files = Path(INPUT_DIR, "example").glob("*.txt") - out_path = Path(OUT_DIR + "test_example_summary.txt") + out_path = Path(OUT_DIR, "test_example_summary.txt") out_path.unlink(missing_ok=True) summarize_example = summarize_networks(example_network_files, example_node_table, algorithm_params, algorithms_with_params) @@ -51,7 +51,7 @@ def test_example_networks(self): # Comparing the dataframes directly with equals does not match because of how the parameter # combinations column is loaded from disk. Therefore, write both to disk and compare the files. - assert filecmp.cmp(out_path, EXPECT_DIR + "expected_example_summary.txt", shallow=False) + assert filecmp.cmp(out_path, EXPECT_DIR / "expected_example_summary.txt", shallow=False) def test_egfr_networks(self): """Test data from EGFR workflow""" @@ -64,14 +64,14 @@ def test_egfr_networks(self): egfr_dataset = Dataset(egfr_dict) egfr_node_table = egfr_dataset.node_table - config.init_from_file(INPUT_DIR + "egfr.yaml") + config.init_from_file(INPUT_DIR / "egfr.yaml") algorithm_params = config.config.algorithm_params algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()] - egfr_network_files = Path(INPUT_DIR + "egfr").glob("*.txt") # must be path to use .glob() + egfr_network_files = Path(INPUT_DIR, "egfr").glob("*.txt") # must be path to use .glob() - out_path = Path(OUT_DIR + "test_egfr_summary.txt") + out_path = Path(OUT_DIR, "test_egfr_summary.txt") out_path.unlink(missing_ok=True) summarize_egfr = summarize_networks(egfr_network_files, egfr_node_table, algorithm_params, algorithms_with_params) @@ -80,7 +80,7 @@ def test_egfr_networks(self): # Comparing the dataframes directly with equals does not match because of how the parameter # combinations column is loaded from disk. Therefore, write both to disk and compare the files. 
-        assert filecmp.cmp(out_path, EXPECT_DIR + "expected_egfr_summary.txt", shallow=False)
+        assert filecmp.cmp(out_path, EXPECT_DIR / "expected_egfr_summary.txt", shallow=False)
 
     def test_load_dataset_dict(self):
         """Test loading files from dataset_dict"""
@@ -95,7 +95,7 @@ def test_load_dataset_dict(self):
 
         # node_table contents are not generated consistently in the same order,
         # so we will check that the contents are the same, but row order doesn't matter
-        expected_node_table = pd.read_csv((EXPECT_DIR + "expected_node_table.txt"), sep="\t")
+        expected_node_table = pd.read_csv((EXPECT_DIR / "expected_node_table.txt"), sep="\t")
 
         # ignore 'NODEID' column because this changes each time upon new generation
         cols_to_compare = [col for col in example_node_table.columns if col != "NODEID"]

From fd5046f165f3ab29e6e154f29f4eab7316a0fb45 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Tue, 13 Jan 2026 12:55:38 -0800
Subject: [PATCH 10/11] style: fmt

---
 spras/statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/statistics.py b/spras/statistics.py
index 5399da390..342f1a5e2 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -66,5 +66,5 @@ def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
 def from_output_pathway(lines) -> nx.Graph:
     with open(lines, 'r') as f:
         lines = f.readlines()[1:]
-    
+
     return nx.read_edgelist(lines, data=(('Rank', int), ('Direction', str)))

From 79cf748b9efe78dff51e69963591ef267a3eb0c8 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Tue, 13 Jan 2026 13:17:48 -0800
Subject: [PATCH 11/11] chore: mention statistics_files param

---
 spras/analysis/summary.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index 1f627493f..e5c0b1f73 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -18,6 +18,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
     @param algo_params: a nested dict mapping algorithm names to dicts that map parameter hashes
     to parameter combinations.
     @param algo_with_params: a list of -params- combinations
+    @param statistics_files: a list of files, one per statistic, containing the computed values
     @return: pandas DataFrame with summary information
     """
     # Ensure that NODEID is the first column