-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPreProcessor.py
More file actions
62 lines (51 loc) · 2.27 KB
/
PreProcessor.py
File metadata and controls
62 lines (51 loc) · 2.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import sys
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_bool_dtype
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from pprint import pprint
# Command-line synopsis, echoed whenever the argument count is wrong.
usage = 'PreProcessor.py /path/to/raw/dataset.csv /path/to/processed/dataset.csv'

# The script name must be followed by exactly two arguments (input path and
# output path); any other count reports the problem and exits with status 1.
if len(sys.argv) != 3:
    print('not enough arguments\n' if len(sys.argv) < 3 else 'too many arguments\n')
    print(usage)
    sys.exit(1)
def main(argv):
    """Clean, scale and encode a headerless CSV dataset, then write it back out.

    Steps performed:
      1. Read ``argv[1]`` as a CSV file without a header row.
      2. Replace every cell matching the ``'?'`` placeholder pattern with NaN
         and drop each row that contains at least one NaN.
      3. Rename the last column to ``'classifier'`` (index labels are mapped
         through ``str`` at the same time).
      4. Standardize every numeric column with ``StandardScaler``.
      5. Replace every string column with the indicator columns produced by
         ``LabelBinarizer``; each new column is named ``'<column>=<class>'``.
      6. Write the resulting frame to ``argv[2]`` as CSV, without the index.

    Args:
        argv: Command-line style sequence; ``argv[1]`` is the input CSV path
            and ``argv[2]`` is the output CSV path.

    Returns:
        0 once the output file has been written.

    Raises:
        TypeError: If a column is neither numeric, string nor boolean.
    """
    input_data_path = Path(argv[1])
    output_data_path = Path(argv[2])
    data = pd.read_csv(input_data_path, header=None)

    # Clean missing values: '?' placeholders become NaN, then any row holding
    # a NaN is discarded.
    data.replace(to_replace=[r'\s*\?\s*'], value=[np.nan], regex=True, inplace=True)
    data.dropna(axis=0, how='any', inplace=True)

    standard_scaler = StandardScaler()
    label_binarizer = LabelBinarizer()
    data.rename(index=str, columns={data.columns[-1]: 'classifier'}, inplace=True)

    # Normalize data. Iterate over a snapshot of the column labels because the
    # loop body drops and appends columns as it goes.
    for column_name in list(data.columns):
        column = data[column_name]
        if is_numeric_dtype(column):
            # DataFrame.as_matrix() was removed in pandas 1.0; .values yields
            # the same 2-D array on every pandas version. The (n, 1) result is
            # flattened so a plain 1-D column is assigned back.
            scaled = standard_scaler.fit_transform(data[[column_name]].values)
            data[column_name] = scaled.ravel()
        elif is_string_dtype(column):
            encoded_array = label_binarizer.fit_transform(column)
            classes = label_binarizer.classes_
            # For a two-class column LabelBinarizer emits a single indicator
            # column that is 1 for classes_[1]; name it after that class
            # rather than classes_[0], which would invert its meaning.
            if encoded_array.shape[1] == 1 and len(classes) == 2:
                encoded_labels = classes[1:]
            else:
                encoded_labels = classes
            # Remove the old categorical column.
            data.drop(column_name, axis=1, inplace=True)
            # Map each new column name to its encoded indicator column.
            new_columns_dict = {
                str(column_name) + '=' + str(label): encoded_array[:, position]
                for position, label in enumerate(encoded_labels)
            }
            # Append the newly encoded columns to the dataframe.
            data = data.assign(**new_columns_dict)
        elif not is_bool_dtype(column):
            raise TypeError('column is of unhandleable dtype: ', column.dtype)

    # pandas asks for file objects to be opened with newline='' so that the
    # line terminator it writes is not translated a second time.
    with output_data_path.open(mode='w', newline='') as output_data_stream:
        data.to_csv(output_data_stream, index=False)
    return 0
# Run the preprocessing pipeline only when this file is executed as a script,
# so that importing the module does not trigger file I/O.
if __name__ == '__main__':
    main(sys.argv)