-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathcompute_perm_pvals_conditional.py
More file actions
100 lines (88 loc) · 3.58 KB
/
compute_perm_pvals_conditional.py
File metadata and controls
100 lines (88 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
#foo.py [file with p-vals] [number of folders] [folders with random results] [output file]
#Script to compute permulation p-values from ocr_phylo(g)lm.r results,
#using the method of Kulinskaya, 2008, in which permulations in the other
#direction than the initial trial are rejected.
#Takes at least four required positional arguments in order:
# The result CSV file of ocr_phylo(g)lm.r run once without permulations for every OCR,
# The number of directories containing results files
# Directories containing results file(s) of ocr_phylo(g)lm run with permulations on
# a subset of the OCRs in the first file. The number of directories provided should
# equal the previous argument.
# An output CSV file, which will be a copy of the input file with additional columns giving
# the permulations p-value and number of trials performed for each OCR
import os, sys
from math import log10, ceil
def parseNum(s):
te = s.split("e")
if len(te) == 2:
num = float(te[0])
base = int(te[1])
else:
num = float(s)
base = 0
return [num, base]
def leq(a, b):
randNum, randBase = a
realNum, realBase = b
return randBase < realBase or (realBase == randBase and randNum <= realNum)
inFile = open(sys.argv[1], "r")
new_header = inFile.readline().strip() + ",\"Exp_Pvalue\",\"Trials\",\"Missing_Trials\"\n" #Name, P-val, Coeff (can be multiple), Exp P-val, Trials, Missing_Trials
names = []
pvals = []
parsed_pvals = {}
#corPvals = []
coeffs = []
additional_coeffs = []
coeffs_negative = {}
for line in inFile:
tokens = line.strip().replace("\"", "").split(",")
if len(tokens) >= 3:
names.append(tokens[0])
pvals.append(tokens[1])
parsed_pvals[tokens[0]] = parseNum(tokens[1])
#corPvals.append(tokens[2])
coeffs.append(tokens[2])
coeffs_negative[tokens[0]] = tokens[2][0] == "-"
if len(tokens) > 3:
# Add the additional coefficients to a list
additional_coeffs.append(tokens[3:len(tokens)])
else:
print("Warning: The following line in the p-value file has an incorrect format")
print(line)
inFile.close()
lower_trials = {}
count_trials = {}
for name in names:
lower_trials[name] = 1
count_trials[name] = 1
nDir = int(sys.argv[2])
for i in range(nDir):
inDir = sys.argv[3+i]
for f in os.listdir(inDir):
inFile = open(inDir + "/" + f, "r")
header = inFile.readline().strip().replace("\"", "").split(",")
column = header.index("Pvalue")
column2 = header.index("Coeff")
for line in inFile:
tokens = line.strip().replace("\"", "").split(",")
name = tokens[0]
if ((tokens[column2][0] == "-") != coeffs_negative[name]) or tokens[column2] == "0":
continue #Reject
pval = tokens[column]
count_trials[name] = 1 + count_trials[name]
if leq(parseNum(pval), parsed_pvals[name]):
lower_trials[name] = lower_trials[name] + 1
outFile = open(sys.argv[3+nDir], "w")
outFile.write(new_header)
for i in range(len(names)):
name = names[i]
outFile.write(",".join([name, pvals[i], coeffs[i]]))
if len(additional_coeffs) > 0:
# Record the additional coefficients
outFile.write(",")
outFile.write(",".join(additional_coeffs[i]))
trials = count_trials[name]
missing_trials = 10 ** max(3, ceil(log10(trials))) - trials
outFile.write("," + str(lower_trials[name] / count_trials[name]) + "," + str(trials) + "," + str(missing_trials) + "\n")
outFile.close()