-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataset.py
More file actions
78 lines (63 loc) · 2.86 KB
/
Dataset.py
File metadata and controls
78 lines (63 loc) · 2.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from os import path
from urllib import request
from sklearn.model_selection import train_test_split
import csv
import numpy
import pandas
import ssl
import re
def getFromUrl(url, file_path):
    """Download `url` to `file_path` unless the file already exists.

    Parameters:
        url: source URL to fetch.
        file_path: local destination path; if it exists, nothing is done.
    """
    if not path.exists(file_path):
        # SECURITY(review): certificate verification is deliberately skipped,
        # as in the original code — consider verifying certificates instead.
        # Scoping the unverified context to this single request fixes the
        # original's side effect of disabling verification process-wide
        # (it reassigned ssl._create_default_https_context globally, even
        # when the file already existed and no download occurred).
        context = ssl._create_unverified_context()
        with request.urlopen(url, context=context) as response:
            with open(file_path, 'wb') as out:
                out.write(response.read())
def normalizeString(string):
    """Convert an arbitrary value to a snake_case token.

    CamelCase boundaries become word breaks, the text is lowercased,
    non-word characters turn into separators, and runs of whitespace
    or underscores collapse into a single underscore.
    """
    # Insert a space before every uppercase letter that is not at the start.
    spaced = re.sub(r'(?<!^)(?=[A-Z])', ' ', str(string))
    # Lowercase, then turn every non-word character into a space.
    cleaned = re.sub(r'[\W]', ' ', spaced.lower())
    # Collapse any run of whitespace/underscores into one underscore.
    return re.sub(r'[\s\_]+', '_', cleaned)
def normalizeColumnNames(dataset):
    """Return a copy of `dataset` whose column names are snake_cased.

    Parameters:
        dataset: source pandas DataFrame (not modified).
    Returns:
        A new DataFrame with the same data and normalizeString-ed column
        names. If two names normalize to the same token, the later column
        silently overwrites the earlier one (same as repeated assignment).
    """
    # Removed the dead local `columns = dataset.columns` from the original;
    # it was never used.
    new_dataset = pandas.DataFrame()
    for c in dataset.columns:
        new_dataset[normalizeString(c)] = dataset[c]
    return new_dataset
def normalizeColumnValues(dataset, columns):
    """Snake_case every value in the given columns.

    Mutates `dataset` in place and also returns it for chaining.
    """
    for column in columns:
        dataset[column] = dataset[column].map(normalizeString)
    return dataset
def save(dataset, path):
    """Write `dataset` to `path` as CSV preceded by a '# rows features' line.

    The first line is '# <row count> <column count - 1>' (one column is
    assumed to be the label); the data follows without header or index.
    """
    header = '# {} {}\n'.format(len(dataset), len(dataset.columns) - 1)
    with open(path, 'w') as f:
        f.write(header)
    # Append the rows after the header line.
    dataset.to_csv(path, header=False, index=False, mode='a')
def split(dataset, size):
    """Split `dataset` into (train, test) where `size` is the train fraction.

    Uses a fixed random_state of 42 so splits are reproducible.
    """
    return train_test_split(dataset, test_size=1.0 - size, random_state=42)
def exportColumns(dataset, path):
    """Write the dataset's column names as a single CSV row to `path`.

    Parameters:
        dataset: pandas DataFrame whose column names are exported.
        path: destination file path (overwritten).
    """
    # newline='' is required for csv.writer: without it, text-mode newline
    # translation doubles the csv module's '\r\n' terminator to '\r\r\n'
    # on Windows.
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(dataset.columns)
def binarizeByLessThanOrEqualTo(dataset, column, threshold):
    """Binarize `column` in place: 1 where value <= threshold, else 0.

    Values are cast to float32 before comparison; returns the dataset.
    """
    as_float = dataset[column].astype(numpy.float32)
    dataset[column] = (as_float <= threshold).astype(int)
    return dataset
def binarizeByMean(dataset, columns):
    """Binarize the given columns in place: 1 where the value is strictly
    below that column's mean, else 0.

    Values are cast to float32 first; the mean is computed per column.
    Returns the dataset.
    """
    numeric = dataset[columns].astype(numpy.float32)
    dataset[columns] = (numeric < numeric.mean()).astype(int)
    return dataset
def binarizeByMedian(dataset, columns):
    """Binarize the given columns in place: 1 where the value is strictly
    below that column's median, else 0.

    Values are cast to float32 first; the median is computed per column.
    Returns the dataset.
    """
    numeric = dataset[columns].astype(numpy.float32)
    dataset[columns] = (numeric < numeric.median()).astype(int)
    return dataset
def normalize(dataset, columns):
    """Min-max scale the given columns into [0, 1] in place.

    Values are cast to float32; min and max are computed per column.
    Returns the dataset. NOTE: a constant column divides by zero and
    yields NaN/inf, matching the original behavior.
    """
    scaled = dataset[columns].astype(numpy.float32)
    lo = scaled.min()
    hi = scaled.max()
    dataset[columns] = (scaled - lo) / (hi - lo)
    return dataset
def standardize(dataset, columns):
    """Standardize the given columns in place to zero mean, unit variance.

    Values are cast to float32; mean and (sample) std are per column.
    Returns the dataset.
    """
    centered = dataset[columns].astype(numpy.float32)
    dataset[columns] = (centered - centered.mean()) / centered.std()
    return dataset
def oneHotEncoding(dataset, binary_columns, categorical_columns):
    """One-hot encode the dataset.

    Binary columns are dummy-encoded with the first level dropped (one
    indicator each, 'col_value' names); categorical columns keep every
    level and use 'col=value' names.
    """
    with_binaries = pandas.get_dummies(
        dataset, columns=binary_columns, drop_first=True)
    return pandas.get_dummies(
        with_binaries, columns=categorical_columns, prefix_sep='=')
def selectLabelColumn(dataset, label_column):
    """Return the dataset with `label_column` moved to the front.

    All other columns keep their relative order; no data is modified.
    """
    feature_columns = [c for c in dataset.columns if c != label_column]
    return dataset[[label_column] + feature_columns]
def numericalColumns(dataset, binary, categorical, exclude=None):
    """Return the dataset's columns that are neither binary, categorical,
    nor explicitly excluded, preserving their original order.

    Parameters:
        dataset: pandas DataFrame whose columns are inspected.
        binary: column names treated as binary.
        categorical: column names treated as categorical.
        exclude: optional extra column names to skip (default: none).
    """
    # None sentinel instead of the original mutable [] default (a classic
    # shared-state pitfall); set membership replaces repeated list scans.
    excluded = set(binary) | set(categorical) | set(exclude or [])
    return [c for c in dataset.columns if c not in excluded]