diff --git a/README.md b/README.md index 383868b..3cf867e 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ Fragle is a fast and versatile method for detection and quantification of circul Fragle_Overview - ## Installation - Download and unzip the Fragle source code (https://github.com/skandlab/FRAGLE/archive/refs/heads/main.zip). @@ -53,7 +52,7 @@ Inside the Fragle software folder, run the follwing commands: - Command Line Argument Description: - **INPUT_FOLDER**: Input folder path full of bam files and corresponding bai files [required string type argument] - **OUTPUT_FOLDER**: Output folder path where the Fragle predictions, processed features, and off-target bam files (in case of targeted sequencing bam files) will be generated [required string type argument] - - **MODE**: 3 possible inputs for mode selection [required string type argument]: + - **MODE**: [required string type argument]: - 'R': run Fragle on raw WGS bam files (or off-target bam files). - 'T': run Fragle on targeted sequencing bam files (containing both on and off-target reads); this mode also requires the **TARGET_BED** option. - 'F': run Fragle directly on processed features obtained from raw bam files (e.g., `Output/data.pkl`). @@ -61,17 +60,15 @@ Inside the Fragle software folder, run the follwing commands: - 'hg19': (default option) if your bam is mapped to hg19 or GRCh37 reference genome. - 'hg38': if your bam is mapped to hg38 reference genome. - **TARGET_BED**: bed file path for targeted sequencing bam file [optional string type argument] - - This argument is only used when 'T' option is provided, meaning that you are running Fragle on targeted sequencing data. + - This argument is only used when 'T' option is provided, meaning that you are running Fragle on targeted sequencing data files. - The bed file is used to derive the off-target bam files from targeted sequencing data (the off-target bam files will be generated inside the **OUTPUT_FOLDER**). - **CPU**: Number of CPU cores to use for parallel processing [integer type optional argument, default: 32] - - If your running environment has multiple processors, setting the CPU to a higher value (e.g., 16 or 32) is recommended. + - If your running environment has multiple processors/cores, setting CPU to a value (e.g., 16 or 32) corresponding to number of available cores is recommended. - A higher CPU core value (if available) will significantly speed up the software execution. - **THREADS**: Number of threads to use for off-target bam file extraction [integer type optional argument, default: 32] - This argument is only utilized when the 'T' option is provided, meaning that you are running Fragle on targeted sequencing data. - A higher THREADS (if available) value will make the off-target bam extraction process significantly faster. - - ## Example Running Commands for Fragle - **Running Fragle on hg19 mapped WGS BAM files:** ```bash diff --git a/feature_generation.py b/feature_generation.py index 4c098fe..d3a84d1 100644 --- a/feature_generation.py +++ b/feature_generation.py @@ -1,41 +1,44 @@ +import argparse import os +import pathlib import subprocess import numpy as np -import pandas as pd import pickle -import copy, sys +parser = argparse.ArgumentParser() +parser.add_argument('--input', type=str, nargs="*", required=True, help='Path(s) to one or more `.bam` files.') +parser.add_argument('--output', type=str, required=True, help='Output folder path where the Fragle predictions and processed features can be found') +parser.add_argument('--cpu', type=int, required=True, help='Number of CPUs to use for parallel processing of bam files') +parser.add_argument('--bin_locations', type=str, required=True, help='Path to file with bin locations.') +args = parser.parse_args() -input_folder = sys.argv[1] -output_folder = sys.argv[2] -CPU = sys.argv[3] -bin_locations = sys.argv[4] -if os.path.exists(output_folder)==False: - os.mkdir(output_folder) +sample_cnt = len(args.input) + +if not os.path.exists(args.output): + os.mkdir(args.output) -sample_cnt = 0 -for file_ in os.listdir(input_folder): - if file_.endswith('.bam'): - sample_cnt += 1 data = np.zeros((sample_cnt, 350)) data_meta = [] i = 0 -for file_ in os.listdir(input_folder): - if file_.endswith('.bam'): - bamfile = f'{input_folder}/{file_}' - output_path = f'{output_folder}/{file_[:-4]}.npy' - print(i+1, file_) +for file_ in args.input: + filename = pathlib.Path(file_).name + if filename[-4:] != ".bam": + raise ValueError( + "Feature Generation: Input file path did not end with '.bam'." + ) # Shouldn't happen + output_path = f"{args.output}/{filename[:-4]}.npy" + print(i + 1, file_) data_meta.append(file_[:-4]) - command = f'python sample_feature_generation.py {bamfile} {output_path} {CPU} {bin_locations}' + command = f"python sample_feature_generation.py {file_} {output_path} {args.cpu} {args.bin_locations}" subprocess.run(command, shell=True) - data[i]= np.load(output_path) + data[i] = np.load(output_path) os.remove(output_path) i += 1 data_dict = {} -data_dict['samples'] = data -data_dict['meta_info'] = data_meta -output_file = f'{output_folder}/data.pkl' -with open(output_file, 'wb') as f: - pickle.dump(data_dict, f) +data_dict["samples"] = data +data_dict["meta_info"] = data_meta +output_file = f"{args.output}/data.pkl" +with open(output_file, "wb") as f: + pickle.dump(data_dict, f) diff --git a/main.py b/main.py index 4d0df61..c03bb0f 100644 --- a/main.py +++ b/main.py @@ -1,54 +1,90 @@ -import os, argparse +import os +import argparse import subprocess +import pathlib parser = argparse.ArgumentParser() -parser.add_argument('--input', type=str, required=True, help='Input folder path full of bam files') +parser.add_argument('--input', type=str, nargs="*", required=True, help='Path(s) to either a single input folder with bam files, to one or more `.bam` files, or to a single data.pkl file with processed features.') parser.add_argument('--output', type=str, required=True, help='Output folder path where the Fragle predictions and processed features can be found') -parser.add_argument('--mode', type=str, required=True, help='3 options: (1) F -> output from processed feature, (2) R -> output from raw WGS/off-target bam file, (3) T -> output from targeted sequencing bam file') +parser.add_argument('--mode', type=str, required=True, choices=["R", "T", "F"], help='3 options: (1) F -> output from processed feature, (2) R -> output from raw WGS/off-target bam file, (3) T -> output from targeted sequencing bam file') parser.add_argument('--genome_build', type=str, default='hg19', help='reference genome version (hg19/GRCh37/hg38) to which your input bam files have been mapped to') parser.add_argument('--target_bed', type=str, default='empty.bed', help='bed file for targeted sequencing (only utilized when T option is used)') parser.add_argument('--cpu', type=int, default=32, help='Number of CPUs to use for parallel processing of bam files') parser.add_argument('--threads', type=int, default=32, help='Number of threads to use for off-target bam file extraction from targeted bam files (only utilized when T option is used)') - args = parser.parse_args() -input_folder = args.input + '/' -output_folder = args.output + '/' -option = args.mode -genome_build = args.genome_build -bed_file = args.target_bed -CPU = args.cpu -num_threads = args.threads - -bin_locations = 'meta_info/hg19_bin_locations.csv' -if genome_build=='hg38': - bin_locations = 'meta_info/hg38_bin_locations.csv' - -if option=='R': - command = 'python feature_generation.py {input} {output} {CPU} {bin_locations}'.format(input=input_folder, output=output_folder, CPU=CPU, bin_locations=bin_locations) - subprocess.run(command, shell=True) - command = 'python predict.py {input} {output}'.format(input=output_folder, output=output_folder) # data.pkl file inside output_folder folder - subprocess.run(command, shell=True) -elif option=='F': - command = 'python predict.py {input} {output}'.format(input=input_folder, output=output_folder) # data.pkl file inside input_folder folder - subprocess.run(command, shell=True) +# Extract bam file paths or the data.pkl path + +input_bam_paths = None +input_pkl_path = None +if len(args.input) == 1 and args.input[0][-4:].lower() not in [".bam", ".pkl"]: + # Input folder specified + input_folder = pathlib.Path(args.input[0]) + input_bam_paths = [path.resolve() for path in input_folder.glob("*.bam")] + if len(input_bam_paths) == 0: + raise RuntimeError( + f"--input: No `.bam` files were found in the input folder: {input_folder}" + ) + +elif len(args.input) == 1 and args.input[0][-4:].lower() == ".pkl": + # Already processed features + input_pkl_path = pathlib.Path(args.input[0]) + +else: + # Direct paths to bam files + if any([path[-4:].lower() != ".bam" for path in args.input]): + raise ValueError( + "--input: When specifying one or more `.bam` files, " + "all paths in --input must end with '.bam'." + ) + input_bam_paths = [pathlib.Path(path).resolve() for path in args.input] + +# Check that we have the right input paths for the right options +if args.mode in ["R", "T"] and input_bam_paths is None: + raise ValueError( + "When --mode is 'R' or 'T', --input must specify the location(s) of `.bed` files." + ) +if args.mode == "F" and input_pkl_path is None: + raise ValueError( + "When --mode is 'F', --input must specify the location of the `data.pkl` file with processed features." + ) -elif option=='T': - off_target_folder = output_folder + 'off_target_bams/' - if os.path.isdir(off_target_folder)==False: +output_folder = args.output +if output_folder[-1] != "/": + output_folder += "/" + +bin_locations = "meta_info/hg19_bin_locations.csv" +if args.genome_build == "hg38": + bin_locations = "meta_info/hg38_bin_locations.csv" + +# Create off-target bam files +if args.mode == "T": + off_target_folder = output_folder + "off_target_bams/" + if not os.path.isdir(off_target_folder): os.mkdir(off_target_folder) - for file_ in os.listdir(input_folder): - if file_.endswith('.bam'): - input_file = input_folder + file_ - output_file = off_target_folder + file_ - command = 'samtools view -b -h -@ {num_threads} -o /dev/null -U {off_bam} -L {bed_file} {in_bam}'.format(num_threads=num_threads, off_bam=output_file, bed_file=bed_file, in_bam=input_file) - subprocess.run(command, shell=True) - command = 'samtools index -@ {num_threads} -b {file_}'.format(num_threads=num_threads, file_=output_file) - subprocess.run(command, shell=True) - - input_folder = off_target_folder - command = 'python feature_generation.py {input} {output} {CPU} {bin_locations}'.format(input=input_folder, output=output_folder, CPU=CPU, bin_locations=bin_locations) - subprocess.run(command, shell=True) - command = 'python predict.py {input} {output}'.format(input=output_folder, output=output_folder) # data.pkl file inside output_folder folder + off_target_files = [] + for file_ in input_bam_paths: + output_file = off_target_folder + file_.name + command = f"samtools view -b -h -@ {args.threads} -o /dev/null -U {output_file} -L {args.target_bed} {file_}" + subprocess.run(command, shell=True) + command = f"samtools index -@ {args.threads} -b {output_file}" + subprocess.run(command, shell=True) + off_target_files += output_file + + # Use these for feature generation and prediction + input_bam_paths = off_target_files + +# Extract features from the bam files +if args.mode in ["R", "T"]: + # Concatenate bam paths to a single string + input_bam_paths_str = " ".join([str(path) for path in input_bam_paths]) + command = f"python feature_generation.py --input {input_bam_paths_str} --output {output_folder} --cpu {args.cpu} --bin_locations {bin_locations}" subprocess.run(command, shell=True) + + # Use these features (located in the output_folder) for prediction + input_pkl_path = pathlib.Path(output_folder) / "data.pkl" + +# Run the prediction +command = f"python predict.py --input {input_pkl_path} --output {output_folder}" # data.pkl file inside output_folder folder +subprocess.run(command, shell=True) diff --git a/predict.py b/predict.py index 6158d7d..e4b9e84 100644 --- a/predict.py +++ b/predict.py @@ -1,28 +1,23 @@ # Importing libraries # sklearn version 1.2.2 required: "pip install scikit-learn==1.2.2" -import json -import pandas as pd -import csv +import argparse import os -import sys +import pandas as pd import pickle import numpy as np -import random -import math import torch -import torchvision import torch.nn.functional as F -import torchvision.datasets as datasets -import torchvision.transforms as transforms from torch import nn -from torch.utils.data import DataLoader -from torch.utils.data import Dataset, DataLoader -from joblib import dump, load +from joblib import load -# Input and Output folder path -input_folder = sys.argv[1] -output_folder = sys.argv[2] +parser = argparse.ArgumentParser() +parser.add_argument('--input', type=str, required=True, help='Path to a data.pkl file with processed features.') +parser.add_argument('--output', type=str, required=True, help='Output folder to store the Fragle predictions at.') +args = parser.parse_args() + +if not os.path.exists(args.output): + os.mkdir(args.output) # loading meta_info for prediction device = torch.device("cpu") @@ -124,8 +119,7 @@ def load_model(model_path): # data loading and feature generation data_dict = {} -data_path = input_folder + 'data.pkl' -with open(data_path, 'rb') as f: +with open(args.input, 'rb') as f: data_dict = pickle.load(f) data = data_dict['samples'] sample_names = data_dict['meta_info'] @@ -138,7 +132,6 @@ def load_model(model_path): print(f'WARNING: too few reads in sample {sample_names[i]}') - data_X = np.copy(data) data_X = data_X + 1 data_X = data_X / np.max(data_X, axis=1)[:, np.newaxis] @@ -168,10 +161,10 @@ def predict_tf(model, model_type): score, _ = model(dataX.to(device)) if model_type=='LT': score = score.item() - csv_path = output_folder + 'LT.csv' + csv_path = args.output + 'LT.csv' elif model_type=='HT': score = score.item() * TF_std + TF_median - csv_path = output_folder + 'HT.csv' + csv_path = args.output + 'HT.csv' csv_list.append([sample_names[i], score]) # csv_list.append([sample_names[i][0], sample_names[i][1], score]) df = pd.DataFrame(csv_list) @@ -197,7 +190,7 @@ def predict_tf(model, model_type): csv_list.append([sample_id, score]) # csv_list.append([cohort, sample_id, score]) -filePath = output_folder + 'Fragle.csv' +filePath = args.output + 'Fragle.csv' df = pd.DataFrame(csv_list) df.to_csv(filePath, index=False, header=['Sample_ID', 'ctDNA_Burden']) # df.to_csv(filePath, index=False, header=['Cohort', 'Sample_ID', 'ctDNA_Burden'])