-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhelper_EDA_model.py
More file actions
132 lines (111 loc) · 5.03 KB
/
helper_EDA_model.py
File metadata and controls
132 lines (111 loc) · 5.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
'''
This function prints an overview of the percentage of NaN values per column in a dataframe.
'''
def percantage_null(data):
    """Print and return the percentage of NaN values per column.

    Parameters
    ----------
    data : pd.DataFrame
        The dataframe to inspect.

    Returns
    -------
    pd.DataFrame
        One 'percentage' column, indexed by column name, sorted descending.
        (Return value added for programmatic use; the original only printed,
        so callers that ignore the result are unaffected.)
    """
    nulls = pd.DataFrame(data.isna().sum() * 100 / len(data), columns=['percentage'])
    nulls = nulls.sort_values('percentage', ascending=False)
    print(nulls)
    return nulls
'''
This function shows the distribution of the values in the columns.
Input: a dataframe, the maximum number of plots, and the number of plots per row.
Output: one plot for every (suitable) column.
'''
def plot_column_distribution(df, max_plots, plots_per_row):
    """Plot the value distribution of each suitable column of `df`.

    Columns with fewer than 2 or 50+ distinct values are skipped. Categorical
    columns get a bar chart of value counts; numeric columns get a histogram.

    Parameters
    ----------
    df : pd.DataFrame
    max_plots : int
        Maximum number of columns to plot.
    plots_per_row : int
        Number of subplots per grid row.
    """
    nunique = df.nunique()
    # Keep only columns with a modest number of distinct values (2..49).
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    column_names = list(df)
    # Bug fix: the original did `max_plots, max_plots = df.shape`, clobbering
    # the parameter, and capped the loop at `min(max_plots, plots_per_row)`,
    # which drew at most one row of plots. Honor the parameter instead.
    n_plots = min(df.shape[1], max_plots)
    # Ceil division for the number of grid rows (original used `/`, yielding
    # a float that was then never used for the subplot grid).
    n_rows = (n_plots + plots_per_row - 1) // plots_per_row
    plt.figure(num=None, figsize=(6 * plots_per_row, 8 * n_rows), dpi=80,
               facecolor='w', edgecolor='k')
    for i in range(n_plots):
        plt.subplot(n_rows, plots_per_row, i + 1)
        column = df.iloc[:, i]
        # Heuristic: treat the column as categorical if its first value is
        # not a numpy-numeric scalar.
        if not np.issubdtype(type(column.iloc[0]), np.number):
            column.value_counts().plot.bar()
        else:
            column.hist()
        plt.ylabel('counts')
        plt.xticks(rotation=90)
        plt.title(f'{column_names[i]} (column {i})')
    plt.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)
    plt.show()
'''
These two functions test the correlation (independence) of the categorical values in the input columns.
Input: a dataframe and the columns we want the correlations for.
Output: whether the null hypothesis is rejected or not, for every combination of columns.
'''
def chi_square_execute(data, columns=[]):
    """Run the pairwise chi-square independence test for every ordered pair
    of distinct columns in `columns` (both (a, b) and (b, a) are tested,
    matching the nested-loop behavior)."""
    for first in columns:
        for second in columns:
            if first == second:
                continue
            chi_square(data, first, second)
def chi_square(data, m, n):
    """Chi-square test of independence between categorical columns `m` and `n`.

    Builds a contingency table, computes the Pearson chi-square statistic by
    hand, and prints both the p-value decision and the critical-value decision
    at significance level alpha = 0.05.

    Parameters
    ----------
    data : pd.DataFrame
    m, n : hashable
        Column labels in `data`.

    Returns
    -------
    tuple (statistic, p_value)
        Added for programmatic use; existing callers (e.g. chi_square_execute)
        ignore the return value and are unaffected.
    """
    data_crosstab = pd.crosstab(data[m], data[n], margins=True, margins_name="Total")
    # significance level
    alpha = 0.05
    # Calculation of the chi-square test statistic: sum over cells of
    # (observed - expected)^2 / expected, with expected counts derived from
    # the marginal totals of the contingency table.
    statistic = 0
    rows = data[m].unique()
    columns = data[n].unique()
    grand_total = data_crosstab['Total']['Total']
    for i in columns:
        for j in rows:
            O = data_crosstab[i][j]
            E = data_crosstab[i]['Total'] * data_crosstab['Total'][j] / grand_total
            statistic += (O - E) ** 2 / E
    # degrees of freedom of the chi-square distribution
    dof = (len(rows) - 1) * (len(columns) - 1)
    print("\n--------------------------------------------------------------------------------------")
    print("\n--------------------------------------------------------------------------------------")
    print("H₀: column", m, " and column", n, "are independent, i.e. no relationship")
    # Bug fix: H1 is the *dependence* hypothesis (original text repeated H0).
    print("H₁: column", m, " and column", n, "are NOT independent, i.e. ∃ a relationship")
    print("α = 0.05")
    # The p-value approach
    print("The p-value approach: The p-value approach to hypothesis testing in the decision rule")
    # Bug fix: the statistic follows a chi-square distribution with `dof`
    # degrees of freedom; the original used the normal CDF, giving wrong p-values.
    p_value = 1 - stats.chi2.cdf(statistic, dof)
    conclusion = "Failed to reject the null hypothesis."
    if p_value <= alpha:
        conclusion = "Null Hypothesis is rejected."
    print("chisquare-score is:", statistic, " and p value is:", p_value)
    print(conclusion)
    # The critical value approach
    print("The critical value approach: The critical value approach to hypothesis testing in the decision rule")
    critical_value = stats.chi2.ppf(1 - alpha, dof)
    conclusion = "Failed to reject the null hypothesis."
    if statistic > critical_value:
        conclusion = "Null Hypothesis is rejected."
    # Bug fix: this line previously mislabelled the critical value as "p value".
    print("chisquare-score is:", statistic, " and critical value is:", critical_value)
    print(conclusion)
    return statistic, p_value
'''
This function applies the Box-Cox transformation to the input columns.
Input: a dataframe - the function selects only the numerical columns.
Output: the transformed columns and the fitted lambdas.
'''
def boxcox_transform(data):
    """Apply a Box-Cox transformation to every numeric column of `data`.

    Box-Cox requires strictly positive input, so non-positive values are first
    replaced with NaN and then imputed with the column mean of the remaining
    (positive) values. NOTE: `data` is modified in place and also returned.

    Parameters
    ----------
    data : pd.DataFrame

    Returns
    -------
    tuple (data, lambdas)
        The transformed dataframe and a dict mapping each numeric column name
        to a one-element list holding its fitted Box-Cox lambda.
    """
    numeric_cols = data.select_dtypes(np.number).columns
    _ci = {column: None for column in numeric_cols}
    for column in numeric_cols:
        # Bug fix: np.NAN was removed in NumPy 2.0; np.nan is the canonical name.
        data[column] = np.where(data[column] <= 0, np.nan, data[column])
        data[column] = data[column].fillna(data[column].mean())
        transformed_data, lmbda = stats.boxcox(data[column])
        data[column] = transformed_data
        _ci[column] = [lmbda]
    return data, _ci
'''
This function removes the outliers from the input columns.
Input: a dataframe, an IQR threshold, columns to remove outliers from, and columns to skip.
Output: the dataframe with outlier rows removed.
'''
def remove_outliers(data, threshold=1.5, in_columns=None, skip_columns=None):
    """Drop rows whose value lies outside the IQR fence, column by column.

    For each column in `in_columns` (unless listed in `skip_columns`), rows
    with values outside (Q1 - threshold*IQR, Q3 + threshold*IQR) are removed.
    Columns are processed sequentially, so later fences are computed on the
    already-filtered data. Values exactly on a fence are dropped (strict
    inequalities, preserving the original behavior).

    Parameters
    ----------
    data : pd.DataFrame
    threshold : float
        IQR multiplier for the fences (1.5 is the conventional Tukey value).
    in_columns : list or None
        Columns to filter; None behaves like an empty list (no filtering),
        matching the old `in_columns=[]` default.
    skip_columns : list or None
        Columns to exclude from filtering.

    Returns
    -------
    pd.DataFrame
        The filtered dataframe (the input is not mutated; filtering rebinds).
    """
    # Bug fix: avoid mutable default arguments (shared list objects).
    in_columns = [] if in_columns is None else in_columns
    skip = set() if skip_columns is None else set(skip_columns)
    for column in in_columns:
        if column in skip:
            continue
        q3 = np.percentile(data[column], 75)
        q1 = np.percentile(data[column], 25)
        iqr = q3 - q1
        upper_limit = q3 + (threshold * iqr)
        lower_limit = q1 - (threshold * iqr)
        data = data[(data[column] > lower_limit) & (data[column] < upper_limit)]
    return data