docs: tutorials for COMBINE25 #369
@@ -0,0 +1,56 @@

```yaml
hash_length: 7
container_framework: docker
unpack_singularity: false
container_registry:
  base_url: docker.io
  owner: reedcompbio

# Each algorithm has an 'include' parameter. By toggling 'include' to true/false the user can change
# which algorithms are run in a given experiment.
#
# Algorithm-specific parameters are embedded in lists so that users can specify multiple. If multiple
# parameters are specified, then the algorithm will be run as many times as needed to cover all parameter
# combinations. For instance, if we have the following:
# - name: "myAlg"
#   params:
#     include: true
#     a: [1,2]
#     b: [0.5,0.75]
#
# then myAlg will be run on (a=1,b=0.5), (a=1,b=0.75), (a=2,b=0.5), and (a=2,b=0.75). Pretty neat, but be
# careful: too many parameters might make your runs take a long time.

algorithms:
  - name: "pathlinker"
    params:
      include: true
      run1:
        k: 1
      # run2:           # uncomment for step 3.2
      #   k: [10, 100]  # uncomment for step 3.2

# Here we specify which pathways to run and other file location information.
# Assume that if a dataset label does not change, the lists of associated input files do not change.
datasets:
  - # Labels can only contain letters, numbers, or underscores
    label: egfr
    node_files: ["tps-egfr-prizes.txt"] # the input nodes
    edge_files: ["phosphosite-irefindex13.0-uniprot.txt"] # the interactome
    # Placeholder
    other_files: []
    # Relative path from the spras repository root directory where these files live
    data_dir: "input"

reconstruction_settings:
  # Set where everything is saved
  locations:
    reconstruction_dir: "output/basic"

analysis:
  # Create one summary per pathway file and a single summary table for all pathways for each dataset
  summary:
    include: false # set to true for step 3.3
  # Create a Cytoscape session file with all pathway graphs for each dataset
  cytoscape:
    include: false # set to true for step 3.3
```
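The parameter-expansion behavior described in the config comments (a run with list-valued parameters expands to the Cartesian product of those lists) can be sketched in Python. `expand_params` is a hypothetical helper for illustration, not part of SPRAS:

```python
from itertools import product

def expand_params(params):
    """Expand list-valued parameters into every combination (Cartesian product).

    Scalar values are treated as single-element lists, mirroring the behavior
    described in the config comments.
    """
    keys = list(params)
    value_lists = [v if isinstance(v, list) else [v] for v in params.values()]
    return [dict(zip(keys, combo)) for combo in product(*value_lists)]

# The 'myAlg' example from the config comments: a has 2 values, b has 2 values,
# so the algorithm is run 4 times.
combos = expand_params({"a": [1, 2], "b": [0.5, 0.75]})
print(combos)
# [{'a': 1, 'b': 0.5}, {'a': 1, 'b': 0.75}, {'a': 2, 'b': 0.5}, {'a': 2, 'b': 0.75}]
```

This is why the comment warns about run time: the number of runs is the product of the list lengths, so a few multi-valued parameters multiply quickly.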
@@ -0,0 +1,132 @@

```yaml
hash_length: 7
container_framework: docker
unpack_singularity: false
container_registry:
  base_url: docker.io
  owner: reedcompbio

# Each algorithm has an 'include' parameter. By toggling 'include' to true/false the user can change
# which algorithms are run in a given experiment.
#
# Algorithm-specific parameters are embedded in lists so that users can specify multiple. If multiple
# parameters are specified, then the algorithm will be run as many times as needed to cover all parameter
# combinations. For instance, if we have the following:
# - name: "myAlg"
#   params:
#     include: true
#     a: [1,2]
#     b: [0.5,0.75]
#
# then myAlg will be run on (a=1,b=0.5), (a=1,b=0.75), (a=2,b=0.5), and (a=2,b=0.75). Pretty neat, but be
# careful: too many parameters might make your runs take a long time.

algorithms:
  - name: "pathlinker"
    params:
      include: true
      run1:
        k: 1
      run2:
        k: [10, 100]
  - name: omicsintegrator1
    params:
      include: true
      run1:
        b: [0.55, 2, 10]
        d: 10
        g: 1e-3
        r: 0.01
        w: 0.1
        mu: 0.008
  - name: omicsintegrator2
    params:
      include: true
      run1:
        b: 4
        g: 0
      run2:
        b: 2
        g: 3
  - name: meo
    params:
      include: true
      run1:
        local_search: ["Yes", "No"]
        max_path_length: [2, 3]
        rand_restarts: 10
  - name: allpairs
    params:
      include: true
  - name: domino
    params:
      include: true
      run1:
        slice_threshold: 0.3
        module_threshold: 0.05
  - name: mincostflow
    params:
      include: true
      run1:
        capacity: 15
        flow: 80
      run2:
        capacity: 1
        flow: 6
      run3:
        capacity: 5
        flow: 60
  - name: "strwr"
    params:
      include: true
      run1:
        alpha: [0.85]
        threshold: [100, 200]
  - name: "rwr"
    params:
      include: true
      run1:
        alpha: [0.85]
        threshold: [100, 200]

# Here we specify which pathways to run and other file location information.
# Assume that if a dataset label does not change, the lists of associated input files do not change.
datasets: # TODO update this based on the dataset that I set up
  - # Labels can only contain letters, numbers, or underscores
    label: egfr
    node_files: ["tps-egfr-prizes.txt"] # the input nodes
    edge_files: ["phosphosite-irefindex13.0-uniprot.txt"] # the interactome
    # Placeholder
    other_files: []
    # Relative path from the spras directory where these files live
    data_dir: "input"

reconstruction_settings:
  # Set where everything is saved
  locations:
    reconstruction_dir: "output/intermediate"

analysis:
  # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
  ml:
    # ML analysis per dataset
    include: false # set to true for step 3
    # Adds ML analysis per algorithm output.
    # Only runs for algorithms with multiple parameter combinations chosen.
    aggregate_per_algorithm: false
    # Specify how many principal components to calculate
    components: 2
    # Boolean to show the labels on the PCA graph
    labels: true
    # 'ward', 'complete', 'average', 'single'
    # If linkage is 'ward', the metric must be 'euclidean'
    linkage: 'ward'
    # 'euclidean', 'manhattan', 'cosine'
    metric: 'euclidean'
    # Controls whether kernel density estimation (KDE) is computed and visualized on top of PCA plots.
    # The coordinates of the KDE maximum (kde_peak) are also saved to the PCA coordinates output file.
    # KDE needs to be run in order to select a parameter combination with PCA because the maximum
    # kernel density is used to pick the 'best' parameter combination.
    kde: false
    # Removes empty pathways from consideration in ML analysis (PCA only)
    remove_empty_pathways: false
```
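The `ml` section above projects pathway outputs into a low-dimensional space with PCA before clustering. A minimal numpy-only sketch of that projection, assuming pathways are encoded as binary edge-membership vectors (the encoding and the example matrix are illustrative, not SPRAS's actual internals):

```python
import numpy as np

# Hypothetical binary matrix: one row per pathway output (one per parameter
# combination), one column per edge in the union of all pathways (1 = present).
pathways = np.array([
    [1, 1, 0, 0, 1],
    [1, 1, 0, 1, 1],
    [0, 0, 1, 1, 0],
    [0, 1, 1, 1, 0],
], dtype=float)

def pca(X, components=2):
    """Project the rows of X onto the top principal components via SVD."""
    centered = X - X.mean(axis=0)        # PCA operates on mean-centered data
    U, S, Vt = np.linalg.svd(centered, full_matrices=False)
    return centered @ Vt[:components].T  # coordinates in principal-component space

coords = pca(pathways, components=2)     # matches 'components: 2' in the config
print(coords.shape)  # (4, 2): one 2-D point per pathway output
```

In SPRAS these 2-D coordinates are what gets plotted, optionally with a KDE overlay whose peak is used to pick the 'best' parameter combination, per the config comments.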
@@ -0,0 +1,31 @@

Advanced Capabilities and Features
==================================

This section is a list of SPRAS capabilities that the tutorial will not demonstrate.

> **Collaborator:** Yes, this can be a list more or less of things SPRAS can do. The beginner and intermediate steps will already take plenty of time.

- Parameter tuning: parameters are not preset and need to be tuned for each dataset

CHTC integration

> **Collaborator:** CHTC is local to our university. The way to say it may be Snakemake integration with cloud and high-throughput computing resources, which we've prototyped in our local cluster. If we start testing in OSG that would be different because many people are eligible for accounts.

Anything not included in the config file

1. Global Workflow Control

Sets options that apply to the entire workflow.

- Examples: the container framework (docker, singularity, dsub) and where to pull container images from

Running SPRAS with multiple parameter combinations, multiple algorithms, and multiple datasets

- For the tutorial we are only using one dataset

4. Gold Standards

Defines the input files SPRAS will use to evaluate output subnetworks.

A gold standard dataset consists of:

- label: the name of the gold standard dataset
- node_file or edge_file: a list of either node files or edge files. Only one or the other can exist in a single dataset; at the moment, only one edge file or one node file can exist in one dataset
- data_dir: the path to where the input gold standard files live
- dataset_labels: a list of dataset labels that links each gold standard to one or more datasets

> **Collaborator:** Indented one in too far.

> **Collaborator:** Is this still indented too much? It looks unaligned in GitHub.
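Based on the fields described above, a gold standard entry might look like the following sketch. The file name, labels, and exact key names (e.g. `node_files`) are assumptions for illustration and should be checked against the SPRAS config documentation:

```yaml
gold_standards:
  - # Name of the gold standard dataset (letters, numbers, or underscores)
    label: egfr_gold
    # Either node files or edge files, never both in one gold standard;
    # at the moment only a single file is supported (file name is hypothetical)
    node_files: ["gold-standard-nodes.txt"]
    # Path to where the input gold standard files live
    data_dir: "input"
    # Links this gold standard to one or more datasets via their labels
    dataset_labels: ["egfr"]
```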