-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.py
More file actions
executable file
·167 lines (141 loc) · 5.47 KB
/
parser.py
File metadata and controls
executable file
·167 lines (141 loc) · 5.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
from collections import defaultdict
import numpy as np
import semnet
from semnet import utils
def rowSplitter(row, separator):
    """Break one path-file row into its left- and right-hand gene lists.

    The row is trimmed, cut at every occurrence of ``separator``, and the
    first two pieces (with surrounding spaces removed) are each passed to
    ``isoSplitter``.  Any pieces after the second are ignored.

    Returns a ``(left, right)`` tuple of the two ``isoSplitter`` results.
    Raises ``IndexError`` when ``separator`` does not occur in the row.
    """
    pieces = row.strip().split(separator)
    lhs = isoSplitter(pieces[0].strip(' '))
    rhs = isoSplitter(pieces[1].strip(' '))
    return lhs, rhs
def isoSplitter(lst, separator='|'):
    """Split a whitespace-delimited gene string, grouping joined isoforms.

    Args:
        lst: String of gene names separated by whitespace.  Isoforms of one
            gene are written as a single token joined by ``separator``
            (e.g. ``"g1 g2a|g2b"``).
        separator: Text joining the isoforms inside one token.

    Returns:
        A list with one entry per token: the token itself when it does not
        contain ``separator``, otherwise a tuple of its isoform names.
    """
    # split() with no argument collapses runs of whitespace, so doubled
    # spaces (or an empty side of an arrow) never yield '' as a gene name.
    return [
        tuple(token.split(separator)) if separator in token else token
        for token in lst.split()
    ]
def parsePathFile(fname):
    """Read a 'Path' file and collect regulatory paths and covariances.

    Each row is classified by the arrow it contains:
        gene1 <-> gene2        gene1 and gene2 covary
        gene1 -> gene2 gene3   gene1 regulates gene2 and gene3
        gene1 <- gene2 gene3   gene2 and gene3 regulate gene1
    Rows that contain no arrow are ignored.

    Args:
        fname: Path of the file to read.

    Returns:
        ``(paths, covs)``.  ``paths`` is a ``defaultdict(list)`` mapping each
        source gene to the list of its targets; ``covs`` is a list with one
        tuple per ``<->`` row holding the genes from both sides of the arrow.
        Returns ``None``, after printing a message, when the file cannot be
        opened.
    """
    # Only the open() call is guarded, so errors raised while parsing are
    # not misreported as a missing file.
    try:
        FH = open(fname)
    except IOError:
        # Call form of print works on both Python 2 and Python 3.
        print("The file {0} does not exist.".format(fname))
        return None

    paths = defaultdict(list)
    covs = []
    with FH:
        for row in FH:
            # '<->' must be tested first: it contains both '->' and '<-'.
            if '<->' in row:
                left, right = rowSplitter(row, '<->')
                covs.append(tuple(left + right))
            elif '->' in row:
                left, right = rowSplitter(row, '->')
                for source in left:
                    paths[source].extend(right)
            elif '<-' in row:
                # Arrow points left, so the right-hand genes are the sources.
                left, right = rowSplitter(row, '<-')
                for source in right:
                    paths[source].extend(left)
    return (paths, covs)
def identifyEndogenous(paths):
    """Return the sorted list of endogenous variables.

    A variable is endogenous when some arrow points into it, i.e. when it
    appears in at least one of the target lists stored as values of
    ``paths``.  Each variable is reported once.
    """
    targets = {gene for downstream in paths.values() for gene in downstream}
    return sorted(targets)
def identifyExogenous(yvar, paths):
    """Return the sorted list of exogenous variables.

    A variable is exogenous when no arrow points into it: it is a key of
    ``paths`` (a source) that is not listed in ``yvar``.

    Args:
        yvar: Iterable of endogenous variables (hashable gene identifiers).
        paths: Mapping of source gene -> list of target genes.

    Returns:
        Sorted list of the keys of ``paths`` that do not occur in ``yvar``.
    """
    # A set difference avoids scanning the yvar list once per key.
    return sorted(set(paths).difference(yvar))
def buildBeta(yvar, paths):
    """Build Beta, the 0/1 matrix of paths between endogenous genes.

    Beta is square with one row and one column per entry of the flattened
    ``yvar`` list.  ``Beta[r, c]`` is set to 1 when the gene in column ``c``
    has the gene in row ``r`` among its targets in ``paths``.

    Args:
        yvar: List of endogenous variables; isoform groups appear as one
            grouped entry.
        paths: Mapping of source gene -> list of target genes.  It is only
            read, never modified.

    Returns:
        A ``numpy`` array of shape ``(n, n)`` where ``n`` is the length of
        the flattened ``yvar``.

    Raises:
        ValueError: if a target is not present in the flattened ``yvar``.
    """
    # flatten the yvar list to remove isoform grouping
    fyvar = utils.flatten_list(yvar, 1)

    # Initialize Beta as all 0's
    nyvar = len(fyvar)
    Beta = np.zeros((nyvar, nyvar))

    # Columns follow the *flattened* ordering, so the offset must advance by
    # each entry's isoform count rather than by its position in yvar.
    # NOTE(review): assumes utils.isoCount(value) equals the number of
    # entries value contributes to the flattened list -- the slice below
    # already relied on that; confirm against semnet.utils.
    colIndex = 0
    for value in yvar:
        isoCnt = utils.isoCount(value)

        # .get() instead of paths[value]: indexing a defaultdict would
        # insert an empty entry for every gene that is only a target.
        downstream = paths.get(value)
        if downstream:
            # get flattened list of downstream targets
            targets = utils.flatten_list(downstream, 1)

            # Add ones to correct spot in Beta
            for target in targets:
                rowIndex = fyvar.index(target)
                Beta[rowIndex, colIndex:colIndex + isoCnt] = 1

        colIndex += isoCnt
    return Beta
def buildGamma(yvar, xvar, paths):
    """Build Gamma, the 0/1 matrix of paths from exogenous to endogenous genes.

    Gamma has one row per entry of the flattened ``yvar`` list and one
    column per entry of the flattened ``xvar`` list.  ``Gamma[r, c]`` is set
    to 1 when the exogenous gene in column ``c`` has the endogenous gene in
    row ``r`` among its targets in ``paths``.

    Args:
        yvar: List of endogenous variables; isoform groups appear as one
            grouped entry.
        xvar: List of exogenous variables, grouped the same way.
        paths: Mapping of source gene -> list of target genes.  It is only
            read, never modified.

    Returns:
        A ``numpy`` array of shape ``(len(flat yvar), len(flat xvar))``.

    Raises:
        ValueError: if a target is not present in the flattened ``yvar``.
    """
    # flatten yvar and xvar to remove isoform grouping
    fyvar = utils.flatten_list(yvar, 1)
    fxvar = utils.flatten_list(xvar, 1)

    # Initialize Gamma as all 0's
    nyvar = len(fyvar)
    nxvar = len(fxvar)
    Gamma = np.zeros((nyvar, nxvar))

    # Columns follow the *flattened* xvar ordering, so the offset must
    # advance by each entry's isoform count rather than by its position in
    # xvar.
    # NOTE(review): assumes utils.isoCount(value) equals the number of
    # entries value contributes to the flattened list -- the slice below
    # already relied on that; confirm against semnet.utils.
    colIndex = 0
    for value in xvar:
        isoCnt = utils.isoCount(value)

        # .get() keeps the lookup read-only even if value is not a key.
        downstream = paths.get(value)
        if downstream:
            # get flattened list of downstream targets
            targets = utils.flatten_list(downstream, 1)

            # Add ones to correct spot in Gamma
            for target in targets:
                rowIndex = fyvar.index(target)
                Gamma[rowIndex, colIndex:colIndex + isoCnt] = 1

        colIndex += isoCnt
    return Gamma
def buildPhi(xvar, covs):
    """Build Phi, the 0/1 variance-covariance matrix of the exogenous genes.

    Phi is square with one row and one column per entry of the flattened
    ``xvar`` list.  It starts as the identity matrix; for every entry of
    ``covs`` the positions of its first two genes are looked up in the
    flattened list and the two mirrored off-diagonal cells are set to 1, so
    the matrix stays symmetric.

    Raises ``ValueError`` when a gene in ``covs`` is not found in the
    flattened ``xvar`` and ``IndexError`` when an entry of ``covs`` holds
    fewer than two genes.
    """
    # flatten xvar to remove isoform grouping
    fxvar = utils.flatten_list(xvar, 1)

    # Identity: every exogenous variable has its own variance term.
    Phi = np.eye(len(fxvar))

    for pair in covs:
        positions = [fxvar.index(gene) for gene in pair]
        first, second = positions[0], positions[1]
        # Symmetric matrix: mark both (i, j) and (j, i).
        Phi[first, second] = 1
        Phi[second, first] = 1
    return Phi
def createPath(fname):
    """Parse the path file ``fname`` and assemble a ``semnet.SemPath``.

    The file is parsed into paths and covariances, the endogenous and
    exogenous variables are identified, and the Beta, Gamma and Phi matrices
    are built and passed to ``semnet.SemPath`` together with the variable
    lists.

    Args:
        fname: Path of the path file to read.

    Returns:
        The object returned by ``semnet.SemPath(xvar, yvar, beta, gamma, phi)``.

    Raises:
        IOError: if ``parsePathFile`` returns ``None``, which it does when
            the file cannot be read.
    """
    parsed = parsePathFile(fname)
    if parsed is None:
        # Fail with a clear error instead of the TypeError that unpacking
        # None would raise.
        raise IOError("Could not read path file {0}".format(fname))
    paths, covs = parsed

    yvar = identifyEndogenous(paths)
    xvar = identifyExogenous(yvar, paths)
    beta = buildBeta(yvar, paths)
    gamma = buildGamma(yvar, xvar, paths)
    phi = buildPhi(xvar, covs)
    graph = semnet.SemPath(xvar, yvar, beta, gamma, phi)
    return graph
# Nothing is done when this module is executed directly.
if __name__ == '__main__':
    pass