24 changes: 20 additions & 4 deletions Snakefile
@@ -2,10 +2,11 @@ import os
from spras import runner
import shutil
import yaml
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.analysis import ml, summary, cytoscape
import spras.config.config as _config
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.statistics import from_output_pathway, statistics_computation, statistics_options

# Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
# and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them
@@ -310,18 +311,33 @@ rule viz_cytoscape:
    run:
        cytoscape.run_cytoscape(input.pathways, output.session, container_settings)

for stat_names, stat_func in statistics_computation.items():
    pythonic_name = 'generate_' + '_and_'.join([name.lower().replace(' ', '_') for name in stat_names])
    rule:
        name: pythonic_name
        input: pathway_file = rules.parse_output.output.standardized_file
        output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{name}.txt']) for name in stat_names]
        run:
            (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True)
            graph = from_output_pathway(input.pathway_file)
            # Avoid shadowing the rule's output list while iterating over it
            for computed, out_file in zip(stat_func(graph), output):
                Path(out_file).write_text(str(computed))

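The loop above registers one dynamically named rule per entry in `statistics_computation`, so each group of related statistics is produced by a single rule. A minimal sketch of the name derivation, runnable outside Snakemake (the key tuple is taken from `spras/statistics.py`):

    stat_names = ('Max degree', 'Median degree')
    pythonic_name = 'generate_' + '_and_'.join([name.lower().replace(' ', '_') for name in stat_names])
    assert pythonic_name == 'generate_max_degree_and_median_degree'
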
# Write a single summary table for all pathways for each dataset
rule summary_table:
    input:
        # Collect all pathways generated for the dataset
        pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params),
        dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
        dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']),
        # Collect all possible options
        statistics = expand(
            '{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}statistics{sep}{statistic}.txt',
            out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, statistic=statistics_options)
    output: summary_table = SEP.join([out_dir, '{dataset}-pathway-summary.txt'])
    run:
        # Load the node table from the pickled dataset file
        node_table = Dataset.from_file(input.dataset_file).node_table
        summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params)
        summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params, input.statistics)
        summary_df.to_csv(output.summary_table, sep='\t', index=False)

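For a concrete picture of the new `statistics` input (all names here are hypothetical), with `out_dir = 'output'`, a dataset `egfr`, and the combination `pathlinker-params-abc123`, the `expand()` call yields one file per statistic, e.g. `output/egfr-pathlinker-params-abc123/statistics/Density.txt`. Note that statistic names such as 'Max degree' contain spaces, so the generated file names do too. Any one of these files can also be requested directly as a Snakemake target:

    snakemake --cores 1 'output/egfr-pathlinker-params-abc123/statistics/Max degree.txt'
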
# Cluster the output pathways for each dataset
57 changes: 13 additions & 44 deletions spras/analysis/summary.py
@@ -1,13 +1,14 @@
import ast
from pathlib import Path
from statistics import median
from typing import Iterable

import networkx as nx
import pandas as pd

from spras.statistics import from_output_pathway


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
                       algo_with_params: list) -> pd.DataFrame:
                       algo_with_params: list, statistics_files: list) -> pd.DataFrame:
    """
    Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
    in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
@@ -17,6 +18,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
    @param algo_params: a nested dict mapping algorithm names to dicts that map parameter hashes to parameter
    combinations.
    @param algo_with_params: a list of <algorithm>-params-<params_hash> combinations
    @param statistics_files: a list of files containing the computed statistics, one value per file
    @return: pandas DataFrame with summary information
    """
    # Ensure that NODEID is the first column
@@ -39,52 +41,17 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg

    # Iterate through each network file path
    for index, file_path in enumerate(sorted(file_paths)):
        with open(file_path, 'r') as f:
            lines = f.readlines()[1:]  # skip the header line

        # directed or mixed graphs are parsed and summarized as an undirected graph
        nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))
        nw = from_output_pathway(file_path)

        # Save the network name, number of nodes, number edges, and number of connected components
        nw_name = str(file_path)
        number_nodes = nw.number_of_nodes()
        number_edges = nw.number_of_edges()
        ncc = nx.number_connected_components(nw)

        # Save the max/median degree, average clustering coefficient, and density
        if number_nodes == 0:
            max_degree = 0
            median_degree = 0.0
            density = 0.0
        else:
            degrees = [deg for _, deg in nw.degree()]
            max_degree = max(degrees)
            median_degree = median(degrees)
            density = nx.density(nw)

        cc = list(nx.connected_components(nw))
        # Save the max diameter
        # Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
        diameters = [
            nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0
            for c in cc
        ]
        max_diameter = max(diameters, default=0)

        # Save the average path lengths
        # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
        avg_path_lengths = [
            nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0
            for c in cc
        ]

        if len(avg_path_lengths) != 0:
            avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
        else:
            avg_path_len = 0.0

        # We use literal_eval here to easily coerce to either ints or floats, depending.
        graph_statistics = [ast.literal_eval(Path(file).read_text()) for file in statistics_files]

        # Initialize list to store current network information
        cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len]
        cur_nw_info = [nw_name, *graph_statistics]

        # Iterate through each node property and save the intersection with the current network
        for node_list in nodes_by_col:
@@ -105,8 +72,10 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
        # Save the current network information to the network summary list
        nw_info.append(cur_nw_info)

    # Get the list of statistic names by their file names
    statistics_options = [Path(file).stem for file in statistics_files]
    # Prepare column names
    col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length']
    col_names = ['Name', *statistics_options]
    col_names.extend(nodes_by_col_labs)
    col_names.append('Parameter combination')

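A brief note on the `ast.literal_eval` coercion above: each statistics file holds a single numeric literal written by the generated rules, so integers and floats round-trip to their original Python types. A minimal sketch with hypothetical file contents:

    import ast
    assert ast.literal_eval('17') == 17        # int, e.g. a node count
    assert ast.literal_eval('0.125') == 0.125  # float, e.g. a density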
70 changes: 70 additions & 0 deletions spras/statistics.py
@@ -0,0 +1,70 @@
"""
Graph statistics, used to power summary.py.

We allow arbitrary computation of any specific statistic on a graph,
computing several related statistics together when they share intermediate
work. See the top-level `statistics_computation` dictionary for usage.
"""

import itertools
from statistics import median
from typing import Callable

import networkx as nx


def compute_degree(graph: nx.DiGraph) -> tuple[int, float]:
    """
    Computes the (max, median) degree of a `graph`.
    """
    # number_of_nodes is a cheap call
    if graph.number_of_nodes() == 0:
        return (0, 0.0)
    else:
        degrees = [deg for _, deg in graph.degree()]
        return max(degrees), median(degrees)

def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
    """
    Computes the (max diameter, average path length) over the connected components of a graph.
    """
    graph: nx.Graph = directed_graph.to_undirected()
    cc = list(nx.connected_components(graph))
    # Save the max diameter
    # Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
    diameters = [
        nx.diameter(graph.subgraph(c).copy()) if len(c) > 1 else 0
        for c in cc
    ]
    max_diameter = max(diameters, default=0)

    # Save the average path lengths
    # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
    avg_path_lengths = [
        nx.average_shortest_path_length(graph.subgraph(c).copy()) if len(c) > 1 else 0.0
        for c in cc
    ]

    if len(avg_path_lengths) != 0:
        avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
    else:
        avg_path_len = 0.0

    return max_diameter, avg_path_len

# The type signature here is imprecise: ideally, an n-tuple key would guarantee n outputs.
statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = {
    ('Number of nodes',): lambda graph: (graph.number_of_nodes(),),
    ('Number of edges',): lambda graph: (graph.number_of_edges(),),
    ('Number of connected components',): lambda graph: (nx.number_connected_components(graph.to_undirected()),),
    ('Density',): lambda graph: (nx.density(graph),),

    ('Max degree', 'Median degree'): compute_degree,
    ('Max diameter', 'Average path length'): compute_on_cc,
}

# All of the keys inside statistics_computation, flattened.
statistics_options: list[str] = list(itertools.chain.from_iterable(statistics_computation.keys()))

def from_output_pathway(file_path) -> nx.Graph:
    """
    Read a standardized output pathway file into a NetworkX graph, skipping the header line.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()[1:]  # skip the header line

    return nx.read_edgelist(lines, data=(('Rank', int), ('Direction', str)))
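
To make the registry's contract concrete, here is a minimal sketch of consuming `statistics_computation` outside Snakemake; the pathway file name is hypothetical, and the file is assumed to be a standardized output pathway with one header line:

    from spras.statistics import from_output_pathway, statistics_computation

    graph = from_output_pathway('pathway.txt')
    for stat_names, stat_func in statistics_computation.items():
        # Each callable returns one value per name in its key tuple
        for name, value in zip(stat_names, stat_func(graph)):
            print(f'{name}: {value}')

Registering a new statistic only requires adding an entry whose key tuple names its outputs; the Snakefile loop then generates the corresponding rule automatically.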
24 changes: 12 additions & 12 deletions test/analysis/test_summary.py
@@ -12,9 +12,9 @@
# - 'NODEID' is required as the first column label in the node table
# - file_paths must be an iterable, even if a single file path is passed

INPUT_DIR = 'test/analysis/input/'
OUT_DIR = 'test/analysis/output/'
EXPECT_DIR = 'test/analysis/expected_output/'
INPUT_DIR = Path('test', 'analysis', 'input')
OUT_DIR = Path('test', 'analysis', 'output')
EXPECT_DIR = Path('test', 'analysis', 'expected_output')


class TestSummary:
@@ -35,14 +35,14 @@ def test_example_networks(self):
        }
        example_dataset = Dataset(example_dict)
        example_node_table = example_dataset.node_table
        config.init_from_file(INPUT_DIR + "config.yaml")
        config.init_from_file(INPUT_DIR / "config.yaml")
        algorithm_params = config.config.algorithm_params
        algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in
                                  algorithm_params.items() for params_hash in param_combos.keys()]

        example_network_files = Path(INPUT_DIR + "example").glob("*.txt")  # must be path to use .glob()
        example_network_files = Path(INPUT_DIR, "example").glob("*.txt")

        out_path = Path(OUT_DIR + "test_example_summary.txt")
        out_path = Path(OUT_DIR, "test_example_summary.txt")
        out_path.unlink(missing_ok=True)
        summarize_example = summarize_networks(example_network_files, example_node_table, algorithm_params,
                                               algorithms_with_params)
@@ -51,7 +51,7 @@ def test_example_networks(self):

        # Comparing the dataframes directly with equals does not match because of how the parameter
        # combinations column is loaded from disk. Therefore, write both to disk and compare the files.
        assert filecmp.cmp(out_path, EXPECT_DIR + "expected_example_summary.txt", shallow=False)
        assert filecmp.cmp(out_path, EXPECT_DIR / "expected_example_summary.txt", shallow=False)

    def test_egfr_networks(self):
        """Test data from EGFR workflow"""
@@ -64,14 +64,14 @@ def test_egfr_networks(self):

        egfr_dataset = Dataset(egfr_dict)
        egfr_node_table = egfr_dataset.node_table
        config.init_from_file(INPUT_DIR + "egfr.yaml")
        config.init_from_file(INPUT_DIR / "egfr.yaml")
        algorithm_params = config.config.algorithm_params
        algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in
                                  algorithm_params.items() for params_hash in param_combos.keys()]

        egfr_network_files = Path(INPUT_DIR + "egfr").glob("*.txt")  # must be path to use .glob()
        egfr_network_files = Path(INPUT_DIR, "egfr").glob("*.txt")  # must be path to use .glob()

        out_path = Path(OUT_DIR + "test_egfr_summary.txt")
        out_path = Path(OUT_DIR, "test_egfr_summary.txt")
        out_path.unlink(missing_ok=True)
        summarize_egfr = summarize_networks(egfr_network_files, egfr_node_table, algorithm_params,
                                            algorithms_with_params)
@@ -80,7 +80,7 @@ def test_egfr_networks(self):

        # Comparing the dataframes directly with equals does not match because of how the parameter
        # combinations column is loaded from disk. Therefore, write both to disk and compare the files.
        assert filecmp.cmp(out_path, EXPECT_DIR + "expected_egfr_summary.txt", shallow=False)
        assert filecmp.cmp(out_path, EXPECT_DIR / "expected_egfr_summary.txt", shallow=False)

    def test_load_dataset_dict(self):
        """Test loading files from dataset_dict"""
@@ -95,7 +95,7 @@ def test_load_dataset_dict(self):

        # node_table contents are not generated consistently in the same order,
        # so we will check that the contents are the same, but row order doesn't matter
        expected_node_table = pd.read_csv((EXPECT_DIR + "expected_node_table.txt"), sep="\t")
        expected_node_table = pd.read_csv((EXPECT_DIR / "expected_node_table.txt"), sep="\t")

        # ignore 'NODEID' column because this changes each time upon new generation
        cols_to_compare = [col for col in example_node_table.columns if col != "NODEID"]
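
The switch from string concatenation to `pathlib` in this file relies on `Path` joining semantics; a quick illustration using the test constants above:

    from pathlib import Path
    assert Path('test', 'analysis', 'input') / 'config.yaml' == Path('test/analysis/input/config.yaml')

Both the multi-argument `Path(...)` constructor and the `/` operator join components with the platform's separator, which is why the directory constants no longer need trailing slashes.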