-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocessing.py
More file actions
93 lines (73 loc) · 2.81 KB
/
preprocessing.py
File metadata and controls
93 lines (73 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""preprocessing.py: Data preprocessing
Usage:
python preprocessing.py expand_peaks -i <input bed file> -o <output bed file> -l 501
"""
import numpy as np
import pandas as pd
# Accepted values for the `centering` argument of expand_peaks; any other
# value is rejected there with a ValueError.
CENTERING_OPTIONS = ['summit', 'endpoints']
def main(args):
    """Run the preprocessing step selected on the command line.

    Args:
        args: parsed arguments exposing `function`, `in_file`, `out_file`,
            `length` and `centering` attributes.

    Raises:
        ValueError: if `args.function` is not 'expand_peaks'.
    """
    # Guard clause: reject anything but the single supported sub-command.
    if args.function != 'expand_peaks':
        raise ValueError(f"Invalid args: {args}")

    expand_peaks(
        args.in_file,
        args.out_file,
        args.length,
        centering=args.centering,
    )
def _refers_to_same_file(path_a, path_b):
    """Return True if both arguments name the same filesystem location."""
    import os

    if path_a == path_b:
        return True
    try:
        # realpath normalises `./x`, relative vs. absolute paths and symlinks.
        return os.path.realpath(path_a) == os.path.realpath(path_b)
    except TypeError:
        # Not path-like (e.g. an open buffer): only the equality test applies.
        return False


def expand_peaks(bed_file, out_file, length, centering='summit'):
    """Standardize bed file peaks to a uniform length.

    Adapted from expand_peaks.py by Calvin Chen.

    Actions:
        - Resize every peak so that end - start == length
        - If centering == 'summit', center the new interval on the summit
          (column 1 + column 9)
        - If centering == 'endpoints', center the new interval on the mean of
          the original endpoints (columns 1 and 2)
        - When a summit column is present, rewrite it as the offset of the
          original summit from the new start
        - Drop peaks whose columns 0, 1 and 2 duplicate an earlier row
        - Write the result tab-separated, without header or index

    NOTE: the new start is not clamped, so a peak whose center lies closer
    than length / 2 to position 0 gets a negative start coordinate.

    Args:
        bed_file (str): whitespace-delimited input bed file, no header row
        out_file (str): destination path; must differ from bed_file
        length (int): width of every output peak, must be positive
        centering (str): how to calculate the center of the new peaks
            - if 'summit', then use the summit
            - if 'endpoints', then use the original center

    Raises:
        ValueError: if out_file and bed_file are the same file, if centering
            is not one of CENTERING_OPTIONS, if length is missing or not
            positive, or if centering == 'summit' but the file has fewer
            than 10 columns.
    """
    if _refers_to_same_file(out_file, bed_file):
        raise ValueError("Don't overwrite old bed file")
    if centering not in CENTERING_OPTIONS:
        raise ValueError(f"Invalid centering option `{centering}`, valid options are {CENTERING_OPTIONS}")
    if length is None or length <= 0:
        raise ValueError(f"Invalid length `{length}`, expected a positive integer")

    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True.
    bed = pd.read_csv(bed_file, sep=r"\s+", header=None)

    has_summit_column = len(bed.columns) >= 10
    if centering == 'summit' and not has_summit_column:
        raise ValueError("Cannot use centering='summit' because bed file does not have summit column. "
                         "Check bed file or use centering='endpoints' if there is no summit column.")

    # Absolute summit position, captured before the endpoints are rewritten so
    # the summit offset can be recomputed against the new start.
    # bed[1]: left endpoint index, bed[9]: summit offset
    summit_idx = bed[1] + bed[9] if has_summit_column else None

    if centering == 'summit':
        midpoint = summit_idx
    else:
        # centering == 'endpoints' (validated above): mean of left and right
        # endpoints, bed[1] and bed[2].
        midpoint = (bed[1] + bed[2]) / 2

    # Recompute endpoints; floor keeps the start an integer for odd lengths.
    bed[1] = np.floor(midpoint - length / 2).astype(int)
    bed[2] = bed[1] + length

    if has_summit_column:
        # Recompute summit offset relative to the new start.
        bed[9] = summit_idx - bed[1]

    # Drop duplicate peaks (same chromosome, start and end), keeping the first.
    bed = bed.drop_duplicates(subset=[0, 1, 2], keep='first')
    bed.to_csv(out_file, index=False, sep="\t", header=False)
def get_args():
    """Parse the command line for this script.

    Returns:
        argparse.Namespace with attributes `function` (positional),
        `in_file` (-i), `out_file` (-o), `length` (-l, parsed as int) and
        `centering` (-c, defaults to 'summit').
    """
    from argparse import ArgumentParser

    cli = ArgumentParser()
    cli.add_argument('function')

    # (flags, extra keyword arguments) for each optional argument, in the
    # order they are registered.
    option_specs = (
        (('--in_file', '-i'), {}),
        (('--out_file', '-o'), {}),
        (('--length', '-l'), {'type': int}),
        (('--centering', '-c'), {'default': 'summit'}),
    )
    for flags, extra in option_specs:
        cli.add_argument(*flags, **extra)

    return cli.parse_args()
if __name__ == '__main__':
    # Script entry point: parse the command line, then dispatch on it.
    cli_args = get_args()
    main(cli_args)