-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathProject-Pipeline.py
More file actions
99 lines (77 loc) · 2.78 KB
/
Project-Pipeline.py
File metadata and controls
99 lines (77 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
# coding: utf-8
# IMPORT DEPENDENCIES
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.externals import joblib
import streamlit as st
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
# DATA AND GLOBAL VARIABLES
# Load the client data; missing values are replaced with 0.
df = pd.read_csv('clientfinaldropped.csv').fillna(0)
# Estimator classes (not instances); each is instantiated with its default
# arguments when it is evaluated.
models = [MultinomialNB, KNeighborsClassifier, RandomForestClassifier]

# Visualize the class balance of the 'Type' column.
st.title("Frequency of Deadbeats")
# Draw on an explicit Figure: calling st.pyplot() with no argument relies on
# Matplotlib's global figure, which Streamlit has deprecated.
_balance_fig, _balance_ax = plt.subplots()
_balance_ax.hist(df['Type'], bins=2, rwidth=0.5)
st.pyplot(_balance_fig)
# Release the figure once Streamlit has rendered it.
plt.close(_balance_fig)
# DEFINE FUNCTIONS
def rescale_numbers(df, scaler):
    """Return a copy of *df* with every int64/float64 column rescaled.

    Args:
        df: DataFrame to rescale. It is left unmodified.
        scaler: Zero-argument callable (for example a scikit-learn scaler
            class) returning an object with a ``fit_transform`` method. A
            fresh scaler is created for each column, so every column is
            scaled independently of the others.

    Returns:
        A new DataFrame with the same columns. Columns whose dtype is int64
        or float64 hold the scaler's output; all other columns are copied
        unchanged.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    rescaled = df.copy()
    if rescaled.empty:
        # Nothing to fit a scaler on (scikit-learn scalers reject zero rows).
        return rescaled
    for col in rescaled:
        if rescaled[col].dtype in ['int64', 'float64']:
            # fit_transform is given a 2-D (n_rows, 1) float array.
            numbers = rescaled[col].astype(float).values.reshape(-1, 1)
            # Flatten the result back to 1-D before storing it as a column.
            rescaled[col] = np.ravel(scaler().fit_transform(numbers))
    return rescaled
def preprocess(df):
    """Run the preprocessing steps on *df* and return the result.

    The only step at present is ``rescale_numbers`` with ``MinMaxScaler``
    as the scaler factory.
    """
    scaled = rescale_numbers(df, MinMaxScaler)
    return scaled
def train_test(df, target):
    """Split *df* into train and test partitions around the *target* column.

    Every column except *target* is treated as a feature. The result is
    whatever ``train_test_split`` returns for ``(features, labels)`` with a
    20% test share and a fixed seed of 42.
    """
    feature_names = [name for name in df if name != target]
    features = df[feature_names]
    labels = df[target]
    return train_test_split(features, labels, test_size=.2, random_state=42)
def evaluate_model(algorithm, train_test):
    """Fit one classifier on a single split and report its metrics.

    The AUC score, the ROC curve and the accuracy are written to the
    Streamlit page as a side effect.

    Args:
        algorithm: Zero-argument callable returning an unfitted estimator
            that provides ``fit``, ``predict_proba`` and ``score``.
        train_test: 4-tuple ``(train_X, test_X, train_y, test_y)``.

    Returns:
        ``(model, score)``: the fitted estimator and the value returned by
        ``model.score`` on the test partition.

    Note:
        Column 1 of ``predict_proba`` is used as the positive-class score,
        so a binary target is assumed.
    """
    train_X, test_X, train_y, test_y = train_test
    model = algorithm().fit(train_X, train_y)

    # Slice the positive-class probabilities once and reuse them below.
    positive_proba = model.predict_proba(test_X)[:, 1]

    auc = roc_auc_score(test_y, positive_proba)
    st.subheader('Area under the Curve Score')
    st.write(auc)

    false_pos_rates, true_pos_rates, _ = roc_curve(test_y, positive_proba)
    st.subheader('Area under the Curve Graph for this KFold Sample')
    # Draw on an explicit Figure: st.pyplot() without an argument depends on
    # Matplotlib's global figure, which Streamlit has deprecated.
    fig, ax = plt.subplots()
    ax.plot(false_pos_rates, true_pos_rates)
    st.pyplot(fig)
    # Release the figure once rendered so figures do not pile up per fold.
    plt.close(fig)

    score = model.score(test_X, test_y)
    st.write(f"Accuracy: {round(score, 2)}")
    return model, score
def k_fold(df, target):
    """Cross-validate every estimator in ``models`` and report mean accuracy.

    For each estimator class in the module-level ``models`` list, runs a
    5-fold stratified cross-validation through ``evaluate_model`` and writes
    the mean of the per-fold scores to the Streamlit page.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target: Name of the target column; every other column is a feature.

    Returns:
        None. All output goes to the Streamlit page.
    """
    features = df[[col for col in df if col != target]]
    labels = df[target]
    # random_state only has an effect when shuffle=True; recent scikit-learn
    # releases raise ValueError if a seed is given without shuffling.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for model in models:
        # Show the class name; the class object itself is not a string.
        st.title(model.__name__)
        # Created once per model so that it accumulates every fold's score
        # (resetting it inside the fold loop kept only the last fold).
        scores = []
        for train_i, test_i in kf.split(features, labels):
            fold_split = (
                features.iloc[train_i],
                features.iloc[test_i],
                labels.iloc[train_i],
                labels.iloc[test_i],
            )
            # evaluate_model returns (model, score); keep the score only.
            scores.append(evaluate_model(model, fold_split)[1])
        st.title("Average Model Score")
        st.write(sum(scores) / len(scores))
# Script entry: preprocess the loaded data, then cross-validate every model
# against the 'Type' column.
prepared = preprocess(df)
k_fold(prepared, target='Type')