-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcensus_visualization.py
More file actions
67 lines (53 loc) · 1.9 KB
/
census_visualization.py
File metadata and controls
67 lines (53 loc) · 1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import seaborn as sea
from ucimlrepo import fetch_ucirepo
import sklearn
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.decomposition import PCA
# Fetch dataset id=2 through ucimlrepo, then separate the feature table
# from the 'income' target column.
adult = fetch_ucirepo(id=2)
X = adult.data.features
y = adult.data.targets['income']

# Feature columns handled as numeric (scaled and passed to PCA below).
numerical_features = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Feature columns grouped as categorical.
categorical_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
def encode(x: str) -> int:
    """Map a raw income label to a binary class id.

    Substring matching is used instead of equality, so label variants that
    carry extra characters around the marker (for example a trailing
    period) are still recognised.

    Args:
        x: Raw income label string.

    Returns:
        0 if the label contains "<=50K", 1 if it contains ">50K".

    Raises:
        ValueError: If the label contains neither marker.
    """
    if "<=50K" in x:
        return 0
    if ">50K" in x:
        return 1
    # Fail loudly rather than implicitly returning None, which would
    # silently introduce missing values into the encoded target array.
    raise ValueError(f"Unrecognized income label: {x!r}")
# Encode the income labels as a 0/1 NumPy array (1 means ">50K").
y = np.array(y.apply(encode))

# Show class imbalance
sea.histplot(y, bins=[0, 0.5, 1])
plt.show()
# y holds only 0/1, so its mean is the share of ALL samples labelled
# ">50K" -- it is not the ratio of ">50K" to "<=50K" samples.
print(f"Proportion of >50K samples among all samples: {y.mean():.2f}")

# Show age distribution
sea.histplot(x=X['age'])
plt.show()
print(f"Mean age: {X['age'].mean():.2f}")
##### Scale the features and apply PCA #####
# PCA usage follows the scikit-learn iris example:
# https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html#sphx-glr-auto-examples-decomposition-plot-pca-iris-py

# Standardize the numerical columns and wrap the result back into a
# DataFrame so the column names survive for the pairplot.
scaler = StandardScaler()
X_numerical_scaled = pd.DataFrame(
    scaler.fit_transform(X[numerical_features]),
    columns=numerical_features,
)

# Project onto the first two principal components and colour each point
# by its encoded income class.
pca = PCA(n_components=2)
pca_X = pca.fit_transform(X_numerical_scaled)
first_component = pca_X[:, 0]
second_component = pca_X[:, 1]
plt.scatter(x=first_component, y=second_component, c=y)
plt.show()

# Finally, a seaborn pairplot over the scaled numerical features.
sea.pairplot(X_numerical_scaled)
plt.show()