-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathLengthy_Annotations.py
More file actions
101 lines (81 loc) · 3.27 KB
/
Lengthy_Annotations.py
File metadata and controls
101 lines (81 loc) · 3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas as pd
import sys
import os
import math
import re
from io import StringIO
#Reading files
def listAndReadFiles(inputFolder, multiplier):
    """Clean every file under *inputFolder* separately and stack the results.

    Walks *inputFolder* recursively, prints each file path, reads the file as
    a header-less, comma-separated UTF-8 CSV and passes it to ``clean_file``
    together with *multiplier*.

    Args:
        inputFolder: Directory to walk; every file found is read as CSV.
        multiplier: Forwarded unchanged to ``clean_file`` for each file.

    Returns:
        A DataFrame with the per-file results of ``clean_file`` concatenated
        in the order the files were visited (row labels are kept as
        returned, not renumbered). An empty DataFrame when no file is found.
    """
    cleaned_frames = []
    for root, _dirs, files in os.walk(inputFolder, topdown=True):
        for name in files:
            path = os.path.join(root, name)
            print(path)
            dataset = pd.read_csv(path, header=None, encoding="utf-8", sep=",")
            cleaned_frames.append(clean_file(dataset, multiplier))

    # pd.concat raises on an empty list, so keep the "nothing found" result
    # an empty frame.
    if not cleaned_frames:
        return pd.DataFrame()
    # One concat at the end replaces the repeated DataFrame.append calls:
    # append was removed in pandas 2.0 and copied the accumulated data on
    # every iteration.
    return pd.concat(cleaned_frames)
def oneShotReadAll(inputFolder, multiplier):
    """Pool every file under *inputFolder* and clean the pooled data once.

    Walks *inputFolder* recursively, reads each file as a header-less,
    comma-separated UTF-8 CSV, concatenates all of them and passes the
    combined frame to ``clean_file`` in a single call, so the cleaning sees
    the data of all files together.

    Args:
        inputFolder: Directory to walk; every file found is read as CSV.
        multiplier: Forwarded unchanged to ``clean_file``.

    Returns:
        Whatever ``clean_file`` returns for the combined data, or an empty
        DataFrame when no file is found.
    """
    raw_frames = []
    for root, _dirs, files in os.walk(inputFolder, topdown=True):
        for name in files:
            path = os.path.join(root, name)
            raw_frames.append(
                pd.read_csv(path, header=None, encoding="utf-8", sep=",")
            )

    # With no input there is nothing to clean; return an empty frame instead
    # of handing clean_file a frame that lacks the columns it reads.
    if not raw_frames:
        return pd.DataFrame()
    # Single concat instead of DataFrame.append in the loop (removed in
    # pandas 2.0). Row labels are kept as read, so they repeat across files.
    return clean_file(pd.concat(raw_frames), multiplier)
#writing to output file
def writeFiles(dataframe, outputFile):
    """Append *dataframe* to *outputFile* as comma-separated values.

    Neither the column header nor the row index is written. The file is
    opened in append mode, so existing content is kept and repeated calls
    accumulate rows in the same file.

    Args:
        dataframe: DataFrame whose rows are written.
        outputFile: Path of the CSV file to append to (created if missing).
    """
    # header/index are documented as booleans; pass False explicitly rather
    # than relying on None being falsy.
    dataframe.to_csv(outputFile, header=False, index=False, sep=",", mode="a")
def clean_file(dataset, multiplier):
    """Return the rows whose duration is an outlier within their label group.

    A row's duration is ``dataset[2] - dataset[1]``; rows are grouped by the
    value in ``dataset[0]``. A row is kept when its duration lies strictly
    above ``mean + multiplier * std`` or strictly below
    ``mean - multiplier * std`` of its own group (``std`` is the sample
    standard deviation). Rows whose group has an undefined standard
    deviation (a single row) or whose label is missing compare as False and
    are therefore never kept.

    Args:
        dataset: DataFrame with integer column labels; columns 0, 1 and 2
            are read. It is not modified.
        multiplier: Number of standard deviations, as a number or a numeric
            string.

    Returns:
        The kept rows in input order, with their original row labels moved
        into a leading ``"index"`` column and column 5 removed. An empty
        DataFrame without columns when no row qualifies.

    Raises:
        ValueError: If *multiplier* cannot be converted to ``float``.
    """
    # Convert once, up front: a bad multiplier should fail loudly instead of
    # being swallowed per row.
    factor = float(multiplier)
    print("lenght of dataset is : ", len(dataset[1]))

    # Keep the duration in its own Series so the caller's frame is untouched.
    durations = dataset[2] - dataset[1]
    group_stats = durations.groupby(dataset[0]).agg(["mean", "std"])

    # Broadcast each group's statistics back onto its rows; labels absent
    # from the statistics (e.g. NaN) map to NaN and fail both comparisons.
    center = dataset[0].map(group_stats["mean"])
    spread = dataset[0].map(group_stats["std"]) * factor
    is_outlier = (durations > center + spread) | (durations < center - spread)

    # Positional mask: stays correct when row labels repeat (frames that
    # were concatenated from several files).
    outliers = dataset[is_outlier.to_numpy()]
    if outliers.empty:
        return pd.DataFrame()

    # NOTE(review): column 5 has always been stripped from the result because
    # it used to hold the scratch duration; an input that brings its own
    # column 5 therefore loses it. Kept for output compatibility - confirm
    # whether inputs can have more than five columns.
    return outliers.drop(columns=[5], errors="ignore").reset_index()
if __name__ == "__main__":
    # Command-line entry point: expects exactly two arguments, the folder to
    # read from and the path the results are appended to.
    if len(sys.argv) != 3:
        sys.stderr.write("Usage : python %s inputfiledirectory outputfileDirectory\n" % sys.argv[0])
        raise SystemExit(1)
    inputFolder = sys.argv[1]
    outputFile = sys.argv[2]

    std = input("enter the required standard deviation")
    # Reject a non-numeric multiplier before any file is read or written;
    # the raw string is still what gets passed on below.
    try:
        float(std)
    except ValueError:
        sys.stderr.write("standard deviation must be a number, got %r\n" % std)
        raise SystemExit(1) from None

    # First pass: each file is cleaned on its own and the results are
    # appended to the output file.
    data_to_write = listAndReadFiles(inputFolder, std)
    writeFiles(data_to_write, outputFile)

    reply = input("do you want to do the whole stuff in one DataFrame? type YES or NO")
    if reply.casefold() == "yes":
        # Optional second pass over all files pooled together, written to a
        # separate file named after the first one.
        data1_to_write = oneShotReadAll(inputFolder, std)
        outputFile = outputFile + "fullstuff" + ".csv"
        writeFiles(data1_to_write, outputFile)
    else:
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive interpreter.
        sys.exit(0)