9 changes: 3 additions & 6 deletions README.md
@@ -4,7 +4,6 @@ Fragle is a fast and versatile method for detection and quantification of circul
 <img src="Fragle_Overview.png" alt="Fragle_Overview" width="70%">
 </div>
 
-
 ## Installation
 - Download and unzip the Fragle source code (https://github.com/skandlab/FRAGLE/archive/refs/heads/main.zip).
 
@@ -53,25 +52,23 @@ Inside the Fragle software folder, run the following commands:
 - Command Line Argument Description:
   - **INPUT_FOLDER**: Input folder path full of bam files and corresponding bai files [required string type argument]
   - **OUTPUT_FOLDER**: Output folder path where the Fragle predictions, processed features, and off-target bam files (in case of targeted sequencing bam files) will be generated [required string type argument]
-  - **MODE**: 3 possible inputs for mode selection [required string type argument]:
+  - **MODE**: [required string type argument]:
     - 'R': run Fragle on raw WGS bam files (or off-target bam files).
     - 'T': run Fragle on targeted sequencing bam files (containing both on and off-target reads); this mode also requires the **TARGET_BED** option.
     - 'F': run Fragle directly on processed features obtained from raw bam files (e.g., `Output/data.pkl`).
   - **GENOME_BUILD**: Specify the reference genome version [optional string type argument]
     - 'hg19': (default option) if your bam is mapped to the hg19 or GRCh37 reference genome.
     - 'hg38': if your bam is mapped to the hg38 reference genome.
   - **TARGET_BED**: bed file path for targeted sequencing bam file [optional string type argument]
-    - This argument is only used when 'T' option is provided, meaning that you are running Fragle on targeted sequencing data.
+    - This argument is only used when the 'T' option is provided, meaning that you are running Fragle on targeted sequencing data files.
     - The bed file is used to derive the off-target bam files from targeted sequencing data (the off-target bam files will be generated inside the **OUTPUT_FOLDER**); see the samtools sketch after this diff.
   - **CPU**: Number of CPU cores to use for parallel processing [integer type optional argument, default: 32]
-    - If your running environment has multiple processors, setting the CPU to a higher value (e.g., 16 or 32) is recommended.
+    - If your running environment has multiple processors/cores, setting CPU to a value (e.g., 16 or 32) corresponding to the number of available cores is recommended.
     - A higher CPU core value (if available) will significantly speed up the software execution.
   - **THREADS**: Number of threads to use for off-target bam file extraction [integer type optional argument, default: 32]
     - This argument is only utilized when the 'T' option is provided, meaning that you are running Fragle on targeted sequencing data.
     - A higher THREADS value (if available) will make the off-target bam extraction process significantly faster.
 
-
-
 ## Example Running Commands for Fragle
 - **Running Fragle on hg19 mapped WGS BAM files:**
 ```bash
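For reference, the off-target extraction controlled by the **TARGET_BED** and **THREADS** arguments (the samtools sketch referenced in the argument list above) has the following shape. The file names here are placeholders; the actual call is the one added in `main.py` further down:

```bash
# Reads overlapping the target regions are discarded to /dev/null;
# all remaining reads (-U) go to the off-target bam, which is then indexed.
samtools view -b -h -@ 32 -o /dev/null -U off_target.bam -L target.bed input.bam
samtools index -@ 32 -b off_target.bam
```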
51 changes: 27 additions & 24 deletions feature_generation.py
@@ -1,41 +1,44 @@
+import argparse
 import os
+import pathlib
 import subprocess
 import numpy as np
 import pandas as pd
 import pickle
-import copy, sys
 
+parser = argparse.ArgumentParser()
+parser.add_argument('--input', type=str, nargs="*", required=True, help='Path(s) to one or more `.bam` files.')
+parser.add_argument('--output', type=str, required=True, help='Output folder path where the Fragle predictions and processed features can be found')
+parser.add_argument('--cpu', type=int, required=True, help='Number of CPUs to use for parallel processing of bam files')
+parser.add_argument('--bin_locations', type=str, required=True, help='Path to file with bin locations.')
+args = parser.parse_args()
 
-input_folder = sys.argv[1]
-output_folder = sys.argv[2]
-CPU = sys.argv[3]
-bin_locations = sys.argv[4]
-if os.path.exists(output_folder)==False:
-    os.mkdir(output_folder)
+sample_cnt = len(args.input)
 
+if not os.path.exists(args.output):
+    os.mkdir(args.output)
 
-sample_cnt = 0
-for file_ in os.listdir(input_folder):
-    if file_.endswith('.bam'):
-        sample_cnt += 1
 data = np.zeros((sample_cnt, 350))
 data_meta = []
 
 i = 0
-for file_ in os.listdir(input_folder):
-    if file_.endswith('.bam'):
-        bamfile = f'{input_folder}/{file_}'
-        output_path = f'{output_folder}/{file_[:-4]}.npy'
-        print(i+1, file_)
+for file_ in args.input:
+    filename = pathlib.Path(file_).name
+    if filename[-4:] != ".bam":
+        raise ValueError(
+            "Feature Generation: Input file path did not end with '.bam'."
+        )  # Shouldn't happen
+    output_path = f"{args.output}/{filename[:-4]}.npy"
+    print(i + 1, file_)
     data_meta.append(file_[:-4])
-    command = f'python sample_feature_generation.py {bamfile} {output_path} {CPU} {bin_locations}'
+    command = f"python sample_feature_generation.py {file_} {output_path} {args.cpu} {args.bin_locations}"
     subprocess.run(command, shell=True)
-    data[i]= np.load(output_path)
+    data[i] = np.load(output_path)
     os.remove(output_path)
     i += 1
 
 data_dict = {}
-data_dict['samples'] = data
-data_dict['meta_info'] = data_meta
-output_file = f'{output_folder}/data.pkl'
-with open(output_file, 'wb') as f:
-    pickle.dump(data_dict, f)
+data_dict["samples"] = data
+data_dict["meta_info"] = data_meta
+output_file = f"{args.output}/data.pkl"
+with open(output_file, "wb") as f:
+    pickle.dump(data_dict, f)
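With the argparse interface introduced above, `feature_generation.py` is now invoked with named options instead of positional arguments. A usage sketch with placeholder paths:

```bash
# Placeholder paths; --input accepts one or more bam files (nargs="*")
python feature_generation.py \
    --input sample1.bam sample2.bam \
    --output Output \
    --cpu 8 \
    --bin_locations meta_info/hg19_bin_locations.csv
```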
116 changes: 76 additions & 40 deletions main.py
@@ -1,54 +1,90 @@
-import os, argparse
+import os
+import argparse
 import subprocess
+import pathlib
 
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--input', type=str, required=True, help='Input folder path full of bam files')
+parser.add_argument('--input', type=str, nargs="*", required=True, help='Path(s) to either a single input folder with bam files, to one or more `.bam` files, or to a single data.pkl file with processed features.')
 parser.add_argument('--output', type=str, required=True, help='Output folder path where the Fragle predictions and processed features can be found')
-parser.add_argument('--mode', type=str, required=True, help='3 options: (1) F -> output from processed feature, (2) R -> output from raw WGS/off-target bam file, (3) T -> output from targeted sequencing bam file')
+parser.add_argument('--mode', type=str, required=True, choices=["R", "T", "F"], help='3 options: (1) F -> output from processed feature, (2) R -> output from raw WGS/off-target bam file, (3) T -> output from targeted sequencing bam file')
 parser.add_argument('--genome_build', type=str, default='hg19', help='reference genome version (hg19/GRCh37/hg38) to which your input bam files have been mapped')
 parser.add_argument('--target_bed', type=str, default='empty.bed', help='bed file for targeted sequencing (only utilized when T option is used)')
 parser.add_argument('--cpu', type=int, default=32, help='Number of CPUs to use for parallel processing of bam files')
 parser.add_argument('--threads', type=int, default=32, help='Number of threads to use for off-target bam file extraction from targeted bam files (only utilized when T option is used)')
 
 args = parser.parse_args()
-input_folder = args.input + '/'
-output_folder = args.output + '/'
-option = args.mode
-genome_build = args.genome_build
-bed_file = args.target_bed
-CPU = args.cpu
-num_threads = args.threads
-
-bin_locations = 'meta_info/hg19_bin_locations.csv'
-if genome_build=='hg38':
-    bin_locations = 'meta_info/hg38_bin_locations.csv'
-
-if option=='R':
-    command = 'python feature_generation.py {input} {output} {CPU} {bin_locations}'.format(input=input_folder, output=output_folder, CPU=CPU, bin_locations=bin_locations)
-    subprocess.run(command, shell=True)
-    command = 'python predict.py {input} {output}'.format(input=output_folder, output=output_folder) # data.pkl file inside output_folder folder
-    subprocess.run(command, shell=True)
-
-elif option=='F':
-    command = 'python predict.py {input} {output}'.format(input=input_folder, output=output_folder) # data.pkl file inside input_folder folder
-    subprocess.run(command, shell=True)
+# Extract bam file paths or the data.pkl path
+
+input_bam_paths = None
+input_pkl_path = None
+if len(args.input) == 1 and args.input[0][-4:].lower() not in [".bam", ".pkl"]:
+    # Input folder specified
+    input_folder = pathlib.Path(args.input[0])
+    input_bam_paths = [path.resolve() for path in input_folder.glob("*.bam")]
+    if len(input_bam_paths) == 0:
+        raise RuntimeError(
+            f"--input: No `.bam` files were found in the input folder: {input_folder}"
+        )
+
+elif len(args.input) == 1 and args.input[0][-4:].lower() == ".pkl":
+    # Already processed features
+    input_pkl_path = pathlib.Path(args.input[0])
+
+else:
+    # Direct paths to bam files
+    if any([path[-4:].lower() != ".bam" for path in args.input]):
+        raise ValueError(
+            "--input: When specifying one or more `.bam` files, "
+            "all paths in --input must end with '.bam'."
+        )
+    input_bam_paths = [pathlib.Path(path).resolve() for path in args.input]
+
+# Check that we have the right input paths for the right options
+if args.mode in ["R", "T"] and input_bam_paths is None:
+    raise ValueError(
+        "When --mode is 'R' or 'T', --input must specify the location(s) of `.bam` files."
+    )
+if args.mode == "F" and input_pkl_path is None:
+    raise ValueError(
+        "When --mode is 'F', --input must specify the location of the `data.pkl` file with processed features."
+    )
 
-elif option=='T':
-    off_target_folder = output_folder + 'off_target_bams/'
-    if os.path.isdir(off_target_folder)==False:
+output_folder = args.output
+if output_folder[-1] != "/":
+    output_folder += "/"
+
+bin_locations = "meta_info/hg19_bin_locations.csv"
+if args.genome_build == "hg38":
+    bin_locations = "meta_info/hg38_bin_locations.csv"
+
+# Create off-target bam files
+if args.mode == "T":
+    off_target_folder = output_folder + "off_target_bams/"
+    if not os.path.isdir(off_target_folder):
         os.mkdir(off_target_folder)
-    for file_ in os.listdir(input_folder):
-        if file_.endswith('.bam'):
-            input_file = input_folder + file_
-            output_file = off_target_folder + file_
-            command = 'samtools view -b -h -@ {num_threads} -o /dev/null -U {off_bam} -L {bed_file} {in_bam}'.format(num_threads=num_threads, off_bam=output_file, bed_file=bed_file, in_bam=input_file)
-            subprocess.run(command, shell=True)
-            command = 'samtools index -@ {num_threads} -b {file_}'.format(num_threads=num_threads, file_=output_file)
-            subprocess.run(command, shell=True)
-
-    input_folder = off_target_folder
-    command = 'python feature_generation.py {input} {output} {CPU} {bin_locations}'.format(input=input_folder, output=output_folder, CPU=CPU, bin_locations=bin_locations)
-    subprocess.run(command, shell=True)
-    command = 'python predict.py {input} {output}'.format(input=output_folder, output=output_folder) # data.pkl file inside output_folder folder
-    subprocess.run(command, shell=True)
+    off_target_files = []
+    for file_ in input_bam_paths:
+        output_file = off_target_folder + file_.name
+        command = f"samtools view -b -h -@ {args.threads} -o /dev/null -U {output_file} -L {args.target_bed} {file_}"
+        subprocess.run(command, shell=True)
+        command = f"samtools index -@ {args.threads} -b {output_file}"
+        subprocess.run(command, shell=True)
+        off_target_files.append(output_file)
+
+    # Use these for feature generation and prediction
+    input_bam_paths = off_target_files
+
+# Extract features from the bam files
+if args.mode in ["R", "T"]:
+    # Concatenate bam paths to a single string
+    input_bam_paths_str = " ".join([str(path) for path in input_bam_paths])
+    command = f"python feature_generation.py --input {input_bam_paths_str} --output {output_folder} --cpu {args.cpu} --bin_locations {bin_locations}"
+    subprocess.run(command, shell=True)
+
+    # Use these features (located in the output_folder) for prediction
+    input_pkl_path = pathlib.Path(output_folder) / "data.pkl"
+
+# Run the prediction
+command = f"python predict.py --input {input_pkl_path} --output {output_folder}"  # data.pkl file inside output_folder folder
+subprocess.run(command, shell=True)
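The reworked `--input` handling above accepts three forms: a folder of bam files, an explicit list of `.bam` paths, or a single `data.pkl` of processed features. Usage sketches with placeholder paths:

```bash
# A folder of bam files (mode R, as before)
python main.py --input Input/ --output Output/ --mode R --genome_build hg19 --cpu 16

# One or more explicit bam paths (new in this change)
python main.py --input a.bam b.bam --output Output/ --mode R

# Previously processed features (mode F)
python main.py --input Output/data.pkl --output Output/ --mode F
```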
35 changes: 14 additions & 21 deletions predict.py
@@ -1,28 +1,23 @@
 # Importing libraries
 # sklearn version 1.2.2 required: "pip install scikit-learn==1.2.2"
-import json
-import pandas as pd
-import csv
+import argparse
+import os
-import sys
+import pandas as pd
 import pickle
 import numpy as np
-import random
-import math
 import torch
-import torchvision
 import torch.nn.functional as F
-import torchvision.datasets as datasets
-import torchvision.transforms as transforms
 from torch import nn
-from torch.utils.data import DataLoader
+from torch.utils.data import Dataset, DataLoader
-from joblib import dump, load
+from joblib import load
 
 
-# Input and Output folder path
-input_folder = sys.argv[1]
-output_folder = sys.argv[2]
+parser = argparse.ArgumentParser()
+parser.add_argument('--input', type=str, required=True, help='Path to a data.pkl file with processed features.')
+parser.add_argument('--output', type=str, required=True, help='Output folder to store the Fragle predictions at.')
+args = parser.parse_args()
+
+if not os.path.exists(args.output):
+    os.mkdir(args.output)
 
 # loading meta_info for prediction
 device = torch.device("cpu")
@@ -124,8 +119,7 @@ def load_model(model_path):

 # data loading and feature generation
 data_dict = {}
-data_path = input_folder + 'data.pkl'
-with open(data_path, 'rb') as f:
+with open(args.input, 'rb') as f:
     data_dict = pickle.load(f)
 data = data_dict['samples']
 sample_names = data_dict['meta_info']
@@ -138,7 +132,6 @@ def load_model(model_path):
         print(f'WARNING: too few reads in sample {sample_names[i]}')
 
 
-
 data_X = np.copy(data)
 data_X = data_X + 1
 data_X = data_X / np.max(data_X, axis=1)[:, np.newaxis]
@@ -168,10 +161,10 @@ def predict_tf(model, model_type):
         score, _ = model(dataX.to(device))
         if model_type=='LT':
             score = score.item()
-            csv_path = output_folder + 'LT.csv'
+            csv_path = args.output + 'LT.csv'
         elif model_type=='HT':
             score = score.item() * TF_std + TF_median
-            csv_path = output_folder + 'HT.csv'
+            csv_path = args.output + 'HT.csv'
         csv_list.append([sample_names[i], score])
         # csv_list.append([sample_names[i][0], sample_names[i][1], score])
     df = pd.DataFrame(csv_list)
@@ -197,7 +190,7 @@ def predict_tf(model, model_type):
     csv_list.append([sample_id, score])
     # csv_list.append([cohort, sample_id, score])
 
-filePath = output_folder + 'Fragle.csv'
+filePath = args.output + 'Fragle.csv'
 df = pd.DataFrame(csv_list)
 df.to_csv(filePath, index=False, header=['Sample_ID', 'ctDNA_Burden'])
 # df.to_csv(filePath, index=False, header=['Cohort', 'Sample_ID', 'ctDNA_Burden'])
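`predict.py` likewise moves from positional `sys.argv` parsing to named options; per the argparse definition above, `--input` now points directly at the `data.pkl` file rather than at its parent folder. A usage sketch with placeholder paths:

```bash
# --input is the data.pkl file itself, not its folder. Keep the trailing
# slash on --output: the script concatenates file names onto it directly
# (e.g., args.output + 'LT.csv').
python predict.py --input Output/data.pkl --output Output/
```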