-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_data.py
More file actions
106 lines (91 loc) · 4.37 KB
/
load_data.py
File metadata and controls
106 lines (91 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""
Load training data into dataframes.
"""
import pandas as pd
from feature_selection import univariate
class LoadData(object):
    """Load the training and test data and hold it as pandas DataFrames.

    After construction the instance exposes the loaded tables as attributes:
    ``clinical``, ``proteomic``, ``rna``, ``mismatch``, ``mislabel``,
    ``train_rna``, ``test_proteomic``, ``test_rna``, ``test_clinical``,
    the index-joined frames ``train_pro_rna``, ``test_pro_rna``,
    ``train_all`` and ``test_all``, and the list ``mislabel_labels``.
    """

    # Category strings found in the merged tables and the integer codes that
    # replace them (paired by position).
    _CATEGORY_VALUES = ['Female', 'Male', 'MSI-Low/MSS', 'MSI-High']
    _CATEGORY_CODES = [0, 1, 0, 1]

    def __init__(self,
                 feature_selection=True,
                 clinical_path='data/tidy/train_cli.csv',
                 proteomic_path='data/tidy/train_pro.csv',
                 rna_path='data/tidy/train_rna.csv',
                 mismatch_path='data/tidy/sum_tab_1.csv',
                 test_proteomic_path='data/raw/test_pro.tsv',
                 test_clinical_path='data/raw/test_cli.tsv',
                 train_rna_path='data/raw/train_rna.tsv',
                 test_rna_path='data/raw/test_rna.tsv',
                 mislabel_path='data/tidy/sum_tab_2.csv'):
        """Load the training and test data into pandas DataFrames.

        Every file is read with its first column as the index.

        Keyword Arguments:
            feature_selection {bool} -- Whether to run select_features()
                after loading. (default: {True})
            clinical_path {str} -- The path to the clinical data (CSV).
                (default: {'data/tidy/train_cli.csv'})
            proteomic_path {str} -- The path to the proteomic data (CSV).
                Note that this will be normalized. (default:
                {'data/tidy/train_pro.csv'})
            rna_path {str} -- The path to the RNA data (CSV). Note that
                this will be normalized. (default:
                {'data/tidy/train_rna.csv'})
            mismatch_path {str} -- The path to the mismatch data (CSV).
                (default: {'data/tidy/sum_tab_1.csv'})
            test_proteomic_path {str} -- The path to the test proteomic
                data (TSV). Transposed after reading, then normalized.
                (default: {'data/raw/test_pro.tsv'})
            test_clinical_path {str} -- The path to the test clinical data
                (TSV). (default: {'data/raw/test_cli.tsv'})
            train_rna_path {str} -- The path to the raw training RNA data
                (TSV). Transposed after reading; not normalized.
                (default: {'data/raw/train_rna.tsv'})
            test_rna_path {str} -- The path to the test RNA data (TSV).
                Transposed after reading, then normalized.
                (default: {'data/raw/test_rna.tsv'})
            mislabel_path {str} -- The path to the mislabel data (CSV).
                (default: {'data/tidy/sum_tab_2.csv'})
        """
        # Tidy training tables (CSV).
        self.clinical = pd.read_csv(clinical_path, index_col=0)
        self.proteomic = self.preprocess(
            pd.read_csv(proteomic_path, index_col=0)
        )
        self.rna = self.preprocess(
            pd.read_csv(rna_path, index_col=0)
        )
        self.mismatch = pd.read_csv(mismatch_path, index_col=0)

        # Raw tables (TSV). The omics tables are transposed after reading.
        self.test_proteomic = self.preprocess(
            self._read_transposed_tsv(test_proteomic_path)
        )
        self.test_rna = self.preprocess(
            self._read_transposed_tsv(test_rna_path)
        )
        self.test_clinical = pd.read_csv(
            test_clinical_path, index_col=0, sep='\t'
        )
        self.train_rna = self._read_transposed_tsv(train_rna_path)

        # Combined frames, outer-joined on the index.
        # NOTE(review): train_pro_rna uses the raw train_rna (never passed
        # through preprocess) while test_pro_rna uses the preprocessed
        # test_rna -- confirm this asymmetry is intended.
        self.train_pro_rna = self._join_on_index(self.train_rna,
                                                 self.proteomic)
        self.test_pro_rna = self._join_on_index(self.test_rna,
                                                self.test_proteomic)
        self.train_all = self._join_on_index(
            self.train_pro_rna, self.clinical
        ).replace(self._CATEGORY_VALUES, self._CATEGORY_CODES)
        self.test_all = self._join_on_index(
            self.test_pro_rna, self.test_clinical
        ).replace(self._CATEGORY_VALUES, self._CATEGORY_CODES)

        self.mislabel = pd.read_csv(mislabel_path, index_col=0)

        # Runs after the combined frames are built, so train_all/test_all
        # keep every column; only rna/proteomic and their test counterparts
        # are reduced.
        if feature_selection:
            self.select_features()

        # create training labels for if a sample has been mislabeled
        self.mislabel_labels = self._label_mislabeled(self.mislabel)

    @staticmethod
    def _read_transposed_tsv(path):
        """Read a tab-separated file (first column as index) and transpose it."""
        return pd.read_csv(path, index_col=0, sep='\t').T

    @staticmethod
    def _join_on_index(left, right):
        """Outer-join two DataFrames on their indexes."""
        return left.merge(right, how='outer',
                          left_index=True, right_index=True)

    @staticmethod
    def _label_mislabeled(mislabel):
        """Build one 0/1 label per row of the mislabel table.

        Arguments:
            mislabel {pandas.DataFrame} -- Table with at least three
                columns.

        Returns:
            list -- 0 where the row's first three columns all hold equal
                values, 1 otherwise, in row order.
        """
        first = mislabel.iloc[:, 0]
        second = mislabel.iloc[:, 1]
        third = mislabel.iloc[:, 2]
        consistent = (first == second) & (second == third)
        return (~consistent).astype(int).tolist()

    def select_features(self):
        """Reduce the RNA and proteomic tables to the selected features.

        The training tables are replaced by the result of univariate()
        called with the table and the clinical data; the test tables are
        then restricted to the columns kept in training.
        """
        self.rna = univariate(self.rna, self.clinical)
        self.proteomic = univariate(self.proteomic, self.clinical)
        self.test_rna = self.test_rna[self.rna.columns]
        self.test_proteomic = self.test_proteomic[self.proteomic.columns]

    def preprocess(self, df):
        """Clean a DataFrame with fix_data() and then normalize() it.

        Arguments:
            df {pandas.DataFrame} -- The data to process.

        Returns:
            pandas.DataFrame -- The cleaned, normalized data.
        """
        return self.normalize(self.fix_data(df))

    def normalize(self, df):
        """Normalize each column into roughly [-1.0, 1.0] centered around 0.0.

        Each column has its mean subtracted and is divided by its range
        (max - min). A column whose range is zero is mapped to 0.0 instead
        of NaN.

        Arguments:
            df {pandas.DataFrame} -- The data to normalize. Each column
                must be quantitative.

        Returns:
            pandas.DataFrame -- The normalized data.
        """
        span = df.max() - df.min()
        # A constant column has zero range; dividing by it would give
        # 0 / 0 = NaN. Use 1.0 there so the centered column stays 0.0.
        span = span.where(span != 0, 1.0)
        return (df - df.mean()) / span

    def fix_data(self, df):
        """Preprocess dataframe to fill NaNs with 0s and remove bad
        columns.

        Columns in which every value is missing are dropped first; the
        remaining missing values are replaced by 0.0.

        Arguments:
            df {pandas.DataFrame} -- DataFrame to be processed.

        Returns:
            pandas.DataFrame -- Processed dataframe. Note that some columns
                may be removed.
        """
        return df.dropna(axis='columns', how='all').fillna(0.0)
if __name__ == "__main__":
    # Manual smoke check: load everything from the default paths and print
    # the training RNA table.
    print(LoadData().rna)