-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_data.py
More file actions
58 lines (52 loc) · 1.69 KB
/
extract_data.py
File metadata and controls
58 lines (52 loc) · 1.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Convert the "adult" dataset files into a numeric training file.
#
# Attribute declarations are read from adult/adult.names (one
# "name: value, value, ..." entry per line) and records from adult/adult.data
# (one comma-separated record per line). Every field that matches a declared
# attribute value is replaced by its position in that attribute's value list,
# every other field is converted with float(), and the field holding the
# target attribute is split off as the class index. One line per complete
# record is written to census.td in the form "<f1>, <f2>, ... | <class index>".
# Records containing the missing-value marker "?" are dropped.
import os

variables = {}  # attribute name -> declared values, in file order
names = []  # non-target attribute names, in declaration order
name_map = {}  # declared value -> name of the attribute declaring it
targets = []  # class index of each kept record (parallel to new_lines)
target_value = "marital-status"  # attribute used as the prediction target
target_map = []  # declared values of the target; position = class index

# os.path.join yields the same "adult\\..." paths on Windows while also
# resolving correctly on platforms that use "/" as the separator.
with open(os.path.join("adult", "adult.data"), "r") as data:
    # Blank lines (e.g. a trailing newline at end of file) are not records.
    lines = [
        x.strip("\n").strip(".").lower().split(", ")
        for x in data
        if x.strip("\n") != ""
    ]

with open(os.path.join("adult", "adult.names"), "r") as data:
    for raw in data:
        entry = raw.strip("\n").strip(".").lower()
        if entry == "":
            # Tolerate blank lines here just as in the data file.
            continue
        name, vals = entry.split(": ")
        values = vals.split(", ")
        if name != target_value:
            variables[name] = values
            names.append(name)
            for v in values:
                name_map[v] = name
        else:
            target_map.extend(values)

new_lines = []
for fields in lines:
    if "?" in fields:
        # A missing value anywhere makes the whole record unusable.
        continue
    numeric = []
    target = None
    for field in fields:
        n = name_map.get(field)
        if n is not None:
            # Categorical value: encode as its index among the declared values.
            numeric.append(float(variables[n].index(field)))
        elif field in target_map:
            # The target is kept out of the feature list.
            target = target_map.index(field)
        else:
            numeric.append(float(field))
    if target is None:
        # Without a label the record cannot be written, and silently keeping
        # it would shift every following label onto the wrong record.
        raise ValueError(
            "record has no %r value: %r" % (target_value, fields)
        )
    # Feature row and label are appended together so the two lists can never
    # get out of step.
    new_lines.append(", ".join([str(x) for x in numeric]))
    targets.append(target)

with open("census.td", "w") as data:
    # Each entry of new_lines is already a joined "f1, f2, ..." string; it
    # must be written as-is (joining it again would put ", " between every
    # single character).
    data.writelines(
        row + " | " + str(target) + "\n"
        for row, target in zip(new_lines, targets)
    )