9 changes: 3 additions & 6 deletions README.md
@@ -4,7 +4,6 @@ Fragle is a fast and versatile method for detection and quantification of circul
 <img src="Fragle_Overview.png" alt="Fragle_Overview" width="70%">
 </div>
 
-
 ## Installation
 - Download and unzip the Fragle source code (https://github.com/skandlab/FRAGLE/archive/refs/heads/main.zip).
 
@@ -53,25 +52,23 @@ Inside the Fragle software folder, run the following commands:
 - Command Line Argument Description:
   - **INPUT_FOLDER**: Input folder path full of bam files and corresponding bai files [required string type argument]
   - **OUTPUT_FOLDER**: Output folder path where the Fragle predictions, processed features, and off-target bam files (in case of targeted sequencing bam files) will be generated [required string type argument]
-  - **MODE**: 3 possible inputs for mode selection [required string type argument]:
+  - **MODE**: [required string type argument]:
     - 'R': run Fragle on raw WGS bam files (or off-target bam files).
     - 'T': run Fragle on targeted sequencing bam files (containing both on and off-target reads); this mode also requires the **TARGET_BED** option.
     - 'F': run Fragle directly on processed features obtained from raw bam files (e.g., `Output/data.pkl`).
   - **GENOME_BUILD**: Specify the reference genome version [optional string type argument]
     - 'hg19': (default option) if your bam is mapped to the hg19 or GRCh37 reference genome.
     - 'hg38': if your bam is mapped to the hg38 reference genome.
   - **TARGET_BED**: bed file path for targeted sequencing bam file [optional string type argument]
-    - This argument is only used when 'T' option is provided, meaning that you are running Fragle on targeted sequencing data.
+    - This argument is only used when the 'T' option is provided, meaning that you are running Fragle on targeted sequencing data files.
     - The bed file is used to derive the off-target bam files from targeted sequencing data (the off-target bam files will be generated inside the **OUTPUT_FOLDER**); see the samtools sketch after this diff.
   - **CPU**: Number of CPU cores to use for parallel processing [integer type optional argument, default: 32]
-    - If your running environment has multiple processors, setting the CPU to a higher value (e.g., 16 or 32) is recommended.
+    - If your running environment has multiple processors/cores, setting CPU to a value (e.g., 16 or 32) corresponding to the number of available cores is recommended.
     - A higher CPU core value (if available) will significantly speed up the software execution.
   - **THREADS**: Number of threads to use for off-target bam file extraction [integer type optional argument, default: 32]
     - This argument is only utilized when the 'T' option is provided, meaning that you are running Fragle on targeted sequencing data.
     - A higher THREADS value (if available) will make the off-target bam extraction process significantly faster.
 
-
-
 ## Example Running Commands for Fragle
 - **Running Fragle on hg19 mapped WGS BAM files:**
 ```bash
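For reference, the off-target extraction controlled by the **TARGET_BED** and **THREADS** arguments (the samtools sketch referenced in the argument list above) has the following shape. The file names here are placeholders; the actual call is the one added in `main.py` further down:

```bash
# Reads overlapping the target regions are discarded to /dev/null;
# all remaining reads (-U) go to the off-target bam, which is then indexed.
samtools view -b -h -@ 32 -o /dev/null -U off_target.bam -L target.bed input.bam
samtools index -@ 32 -b off_target.bam
```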
51 changes: 27 additions & 24 deletions feature_generation.py
@@ -1,41 +1,44 @@
+import argparse
 import os
+import pathlib
 import subprocess
 import numpy as np
 import pandas as pd
 import pickle
-import copy, sys
 
+parser = argparse.ArgumentParser()
+parser.add_argument('--input', type=str, nargs="*", required=True, help='Path(s) to one or more `.bam` files.')
+parser.add_argument('--output', type=str, required=True, help='Output folder path where the Fragle predictions and processed features can be found')
+parser.add_argument('--cpu', type=int, required=True, help='Number of CPUs to use for parallel processing of bam files')
+parser.add_argument('--bin_locations', type=str, required=True, help='Path to file with bin locations.')
+args = parser.parse_args()
 
-input_folder = sys.argv[1]
-output_folder = sys.argv[2]
-CPU = sys.argv[3]
-bin_locations = sys.argv[4]
-if os.path.exists(output_folder)==False:
-    os.mkdir(output_folder)
+sample_cnt = len(args.input)
 
+if not os.path.exists(args.output):
+    os.mkdir(args.output)
 
-sample_cnt = 0
-for file_ in os.listdir(input_folder):
-    if file_.endswith('.bam'):
-        sample_cnt += 1
 data = np.zeros((sample_cnt, 350))
 data_meta = []
 
 i = 0
-for file_ in os.listdir(input_folder):
-    if file_.endswith('.bam'):
-        bamfile = f'{input_folder}/{file_}'
-        output_path = f'{output_folder}/{file_[:-4]}.npy'
-        print(i+1, file_)
+for file_ in args.input:
+    filename = pathlib.Path(file_).name
+    if filename[-4:] != ".bam":
+        raise ValueError(
+            "Feature Generation: Input file path did not end with '.bam'."
+        )  # Shouldn't happen
+    output_path = f"{args.output}/{filename[:-4]}.npy"
+    print(i + 1, file_)
     data_meta.append(file_[:-4])
-    command = f'python sample_feature_generation.py {bamfile} {output_path} {CPU} {bin_locations}'
+    command = f"python sample_feature_generation.py {file_} {output_path} {args.cpu} {args.bin_locations}"
     subprocess.run(command, shell=True)
-    data[i]= np.load(output_path)
+    data[i] = np.load(output_path)
     os.remove(output_path)
     i += 1
 
 data_dict = {}
-data_dict['samples'] = data
-data_dict['meta_info'] = data_meta
-output_file = f'{output_folder}/data.pkl'
-with open(output_file, 'wb') as f:
-    pickle.dump(data_dict, f)
+data_dict["samples"] = data
+data_dict["meta_info"] = data_meta
+output_file = f"{args.output}/data.pkl"
+with open(output_file, "wb") as f:
+    pickle.dump(data_dict, f)
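With the argparse interface introduced above, `feature_generation.py` is now invoked with named options instead of positional arguments. A usage sketch with placeholder paths:

```bash
# Placeholder paths; --input accepts one or more bam files (nargs="*")
python feature_generation.py \
    --input sample1.bam sample2.bam \
    --output Output \
    --cpu 8 \
    --bin_locations meta_info/hg19_bin_locations.csv
```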
116 changes: 76 additions & 40 deletions main.py
@@ -1,54 +1,90 @@
-import os, argparse
+import os
+import argparse
 import subprocess
+import pathlib
 
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--input', type=str, required=True, help='Input folder path full of bam files')
+parser.add_argument('--input', type=str, nargs="*", required=True, help='Path(s) to either a single input folder with bam files, to one or more `.bam` files, or to a single data.pkl file with processed features.')
 parser.add_argument('--output', type=str, required=True, help='Output folder path where the Fragle predictions and processed features can be found')
-parser.add_argument('--mode', type=str, required=True, help='3 options: (1) F -> output from processed feature, (2) R -> output from raw WGS/off-target bam file, (3) T -> output from targeted sequencing bam file')
+parser.add_argument('--mode', type=str, required=True, choices=["R", "T", "F"], help='3 options: (1) F -> output from processed feature, (2) R -> output from raw WGS/off-target bam file, (3) T -> output from targeted sequencing bam file')
 parser.add_argument('--genome_build', type=str, default='hg19', help='reference genome version (hg19/GRCh37/hg38) to which your input bam files have been mapped')
 parser.add_argument('--target_bed', type=str, default='empty.bed', help='bed file for targeted sequencing (only utilized when T option is used)')
 parser.add_argument('--cpu', type=int, default=32, help='Number of CPUs to use for parallel processing of bam files')
 parser.add_argument('--threads', type=int, default=32, help='Number of threads to use for off-target bam file extraction from targeted bam files (only utilized when T option is used)')
 
 args = parser.parse_args()
-input_folder = args.input + '/'
-output_folder = args.output + '/'
-option = args.mode
-genome_build = args.genome_build
-bed_file = args.target_bed
-CPU = args.cpu
-num_threads = args.threads
-
-bin_locations = 'meta_info/hg19_bin_locations.csv'
-if genome_build=='hg38':
-    bin_locations = 'meta_info/hg38_bin_locations.csv'
-
-if option=='R':
-    command = 'python feature_generation.py {input} {output} {CPU} {bin_locations}'.format(input=input_folder, output=output_folder, CPU=CPU, bin_locations=bin_locations)
-    subprocess.run(command, shell=True)
-    command = 'python predict.py {input} {output}'.format(input=output_folder, output=output_folder) # data.pkl file inside output_folder folder
-    subprocess.run(command, shell=True)
-
-elif option=='F':
-    command = 'python predict.py {input} {output}'.format(input=input_folder, output=output_folder) # data.pkl file inside input_folder folder
-    subprocess.run(command, shell=True)
+# Extract bam file paths or the data.pkl path
+
+input_bam_paths = None
+input_pkl_path = None
+if len(args.input) == 1 and args.input[0][-4:].lower() not in [".bam", ".pkl"]:
+    # Input folder specified
+    input_folder = pathlib.Path(args.input[0])
+    input_bam_paths = [path.resolve() for path in input_folder.glob("*.bam")]
+    if len(input_bam_paths) == 0:
+        raise RuntimeError(
+            f"--input: No `.bam` files were found in the input folder: {input_folder}"
+        )
+
+elif len(args.input) == 1 and args.input[0][-4:].lower() == ".pkl":
+    # Already processed features
+    input_pkl_path = pathlib.Path(args.input[0])
+
+else:
+    # Direct paths to bam files
+    if any([path[-4:].lower() != ".bam" for path in args.input]):
+        raise ValueError(
+            "--input: When specifying one or more `.bam` files, "
+            "all paths in --input must end with '.bam'."
+        )
+    input_bam_paths = [pathlib.Path(path).resolve() for path in args.input]
+
+# Check that we have the right input paths for the right options
+if args.mode in ["R", "T"] and input_bam_paths is None:
+    raise ValueError(
+        "When --mode is 'R' or 'T', --input must specify the location(s) of `.bam` files."
+    )
+if args.mode == "F" and input_pkl_path is None:
+    raise ValueError(
+        "When --mode is 'F', --input must specify the location of the `data.pkl` file with processed features."
+    )
 
-elif option=='T':
-    off_target_folder = output_folder + 'off_target_bams/'
-    if os.path.isdir(off_target_folder)==False:
+output_folder = args.output
+if output_folder[-1] != "/":
+    output_folder += "/"
+
+bin_locations = "meta_info/hg19_bin_locations.csv"
+if args.genome_build == "hg38":
+    bin_locations = "meta_info/hg38_bin_locations.csv"
+
+# Create off-target bam files
+if args.mode == "T":
+    off_target_folder = output_folder + "off_target_bams/"
+    if not os.path.isdir(off_target_folder):
         os.mkdir(off_target_folder)
-    for file_ in os.listdir(input_folder):
-        if file_.endswith('.bam'):
-            input_file = input_folder + file_
-            output_file = off_target_folder + file_
-            command = 'samtools view -b -h -@ {num_threads} -o /dev/null -U {off_bam} -L {bed_file} {in_bam}'.format(num_threads=num_threads, off_bam=output_file, bed_file=bed_file, in_bam=input_file)
-            subprocess.run(command, shell=True)
-            command = 'samtools index -@ {num_threads} -b {file_}'.format(num_threads=num_threads, file_=output_file)
-            subprocess.run(command, shell=True)
-
-    input_folder = off_target_folder
-    command = 'python feature_generation.py {input} {output} {CPU} {bin_locations}'.format(input=input_folder, output=output_folder, CPU=CPU, bin_locations=bin_locations)
-    subprocess.run(command, shell=True)
-    command = 'python predict.py {input} {output}'.format(input=output_folder, output=output_folder) # data.pkl file inside output_folder folder
-    subprocess.run(command, shell=True)
+    off_target_files = []
+    for file_ in input_bam_paths:
+        output_file = off_target_folder + file_.name
+        command = f"samtools view -b -h -@ {args.threads} -o /dev/null -U {output_file} -L {args.target_bed} {file_}"
+        subprocess.run(command, shell=True)
+        command = f"samtools index -@ {args.threads} -b {output_file}"
+        subprocess.run(command, shell=True)
+        off_target_files.append(output_file)
+
+    # Use these for feature generation and prediction
+    input_bam_paths = off_target_files
+
+# Extract features from the bam files
+if args.mode in ["R", "T"]:
+    # Concatenate bam paths to a single string
+    input_bam_paths_str = " ".join([str(path) for path in input_bam_paths])
+    command = f"python feature_generation.py --input {input_bam_paths_str} --output {output_folder} --cpu {args.cpu} --bin_locations {bin_locations}"
+    subprocess.run(command, shell=True)
+
+    # Use these features (located in the output_folder) for prediction
+    input_pkl_path = pathlib.Path(output_folder) / "data.pkl"
+
+# Run the prediction
+command = f"python predict.py --input {input_pkl_path} --output {output_folder}"  # data.pkl file inside output_folder folder
+subprocess.run(command, shell=True)
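The reworked `--input` handling above accepts three forms: a folder of bam files, an explicit list of `.bam` paths, or a single `data.pkl` of processed features. Usage sketches with placeholder paths:

```bash
# A folder of bam files (mode R, as before)
python main.py --input Input/ --output Output/ --mode R --genome_build hg19 --cpu 16

# One or more explicit bam paths (new in this change)
python main.py --input a.bam b.bam --output Output/ --mode R

# Previously processed features (mode F)
python main.py --input Output/data.pkl --output Output/ --mode F
```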
35 changes: 14 additions & 21 deletions predict.py
@@ -1,28 +1,23 @@
 # Importing libraries
 # sklearn version 1.2.2 required: "pip install scikit-learn==1.2.2"
-import json
-import pandas as pd
-import csv
+import argparse
+import os
-import sys
+import pandas as pd
 import pickle
 import numpy as np
-import random
-import math
 import torch
-import torchvision
 import torch.nn.functional as F
-import torchvision.datasets as datasets
-import torchvision.transforms as transforms
 from torch import nn
-from torch.utils.data import DataLoader
+from torch.utils.data import Dataset, DataLoader
-from joblib import dump, load
+from joblib import load
 
 
-# Input and Output folder path
-input_folder = sys.argv[1]
-output_folder = sys.argv[2]
+parser = argparse.ArgumentParser()
+parser.add_argument('--input', type=str, required=True, help='Path to a data.pkl file with processed features.')
+parser.add_argument('--output', type=str, required=True, help='Output folder to store the Fragle predictions at.')
+args = parser.parse_args()
+
+if not os.path.exists(args.output):
+    os.mkdir(args.output)
 
 # loading meta_info for prediction
 device = torch.device("cpu")
@@ -124,8 +119,7 @@ def load_model(model_path):

 # data loading and feature generation
 data_dict = {}
-data_path = input_folder + 'data.pkl'
-with open(data_path, 'rb') as f:
+with open(args.input, 'rb') as f:
     data_dict = pickle.load(f)
 data = data_dict['samples']
 sample_names = data_dict['meta_info']
@@ -138,7 +132,6 @@ def load_model(model_path):
         print(f'WARNING: too few reads in sample {sample_names[i]}')
 
 
-
 data_X = np.copy(data)
 data_X = data_X + 1
 data_X = data_X / np.max(data_X, axis=1)[:, np.newaxis]
@@ -168,10 +161,10 @@ def predict_tf(model, model_type):
         score, _ = model(dataX.to(device))
         if model_type=='LT':
             score = score.item()
-            csv_path = output_folder + 'LT.csv'
+            csv_path = args.output + 'LT.csv'
         elif model_type=='HT':
             score = score.item() * TF_std + TF_median
-            csv_path = output_folder + 'HT.csv'
+            csv_path = args.output + 'HT.csv'
         csv_list.append([sample_names[i], score])
         # csv_list.append([sample_names[i][0], sample_names[i][1], score])
     df = pd.DataFrame(csv_list)
@@ -197,7 +190,7 @@ def predict_tf(model, model_type):
     csv_list.append([sample_id, score])
     # csv_list.append([cohort, sample_id, score])
 
-filePath = output_folder + 'Fragle.csv'
+filePath = args.output + 'Fragle.csv'
 df = pd.DataFrame(csv_list)
 df.to_csv(filePath, index=False, header=['Sample_ID', 'ctDNA_Burden'])
 # df.to_csv(filePath, index=False, header=['Cohort', 'Sample_ID', 'ctDNA_Burden'])
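`predict.py` likewise moves from positional `sys.argv` parsing to named options; per the argparse definition above, `--input` now points directly at the `data.pkl` file rather than at its parent folder. A usage sketch with placeholder paths:

```bash
# --input is the data.pkl file itself, not its folder. Keep the trailing
# slash on --output: the script concatenates file names onto it directly
# (e.g., args.output + 'LT.csv').
python predict.py --input Output/data.pkl --output Output/
```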