# Learning-ML/regression_tool.py at main · phsadhankar/Learning-ML · GitHub
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt

# Sidebar label -> (widget-key suffix, results header, key used in the saved report).
# Insertion order is the order shown in the algorithm select box.
_ALGORITHMS = {
    'Linear Regression': ('lr', "Linear Regression Results: ", 'Linear Regression'),
    'Polynomial Regression': ('pr', "Polynomial Regression Results: ", 'Polynomial Regression'),
    'Support Vector Regression (SVR)': ('svr', "SVR Results: ", 'Support Vector Regression'),
    'Decision Tree Regression': ('dtr', "Decision Tree Regression Results: ", 'Decision Tree Regression'),
    'Random Forest Regression': ('rfr', "Random Forest Regression Results: ", 'Random Forest Regression'),
}


def _rerun():
    """Restart the current Streamlit script run."""
    # Prefer st.rerun (the non-experimental API) and fall back to
    # st.experimental_rerun on Streamlit versions that predate it.
    rerun = getattr(st, "rerun", None)
    if rerun is None:
        rerun = st.experimental_rerun
    rerun()


def _encode_categoricals(df, categorical_columns):
    """Return a copy of *df* with each listed column replaced by its integer category codes."""
    encoded = df.copy()
    for column in categorical_columns:
        encoded[column] = pd.Categorical(encoded[column]).codes
    return encoded


def _split(df, independent_columns, dependent_column, test_size):
    """Split predictors and target into train/test parts with a fixed seed.

    Returns ``(x_train, x_test, y_train, y_test)`` as produced by
    ``train_test_split``; the feature frames contain only *independent_columns*.
    """
    features = df[independent_columns]
    target = df[dependent_column]
    return train_test_split(features, target, test_size=test_size, random_state=42)


def _show_split(x_train, x_test, test_size):
    """Write the train/test sample counts to the page."""
    st.write(f"Training set: {len(x_train)} samples")
    # Format as a whole percent: test_size * 100 can carry float noise
    # (e.g. 0.29 * 100 == 28.999999999999996).
    st.write(f"Test set: {len(x_test)} samples ({test_size * 100:.0f}% of the data)")


def _adjusted_r2(r2, n, k):
    """Return the adjusted R² for *n* samples and *k* predictors.

    Returns NaN when ``n <= k + 1``, where the statistic is undefined.
    """
    if n <= k + 1:
        return float('nan')
    return 1 - (((1 - r2) * (n - 1)) / (n - k - 1))


def _select_hyperparameters(algo):
    """Render the sidebar hyperparameter widget for *algo* and return its value as kwargs."""
    if algo == 'Polynomial Regression':
        return {'degree': st.sidebar.slider("Degree of the polynomial features:", 2, 5, 2, key="degree_pr")}
    if algo == 'Support Vector Regression (SVR)':
        return {'kernel': st.sidebar.selectbox("Select SVR kernel:", ['linear', 'poly', 'rbf', 'sigmoid'], key="kernel_svr")}
    if algo == 'Random Forest Regression':
        return {'n_estimators': st.sidebar.slider("Number of trees in the forest:", 10, 100, 10, key="n_estimators_rfr")}
    return {}


def _fit_and_predict(algo, params, x_train, x_test, y_train):
    """Fit the estimator for *algo* on the training data.

    Returns ``(fitted_model, predictions_for_x_test)``. Any preprocessing is
    fitted on the training features only and then applied to the test features.
    """
    if algo == 'Polynomial Regression':
        poly_features = PolynomialFeatures(**params)
        x_train = poly_features.fit_transform(x_train)
        # transform, not fit_transform: the expansion is defined by the training data.
        x_test = poly_features.transform(x_test)
        model = LinearRegression()
    elif algo == 'Support Vector Regression (SVR)':
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        model = SVR(**params)
    elif algo == 'Decision Tree Regression':
        model = DecisionTreeRegressor(random_state=42)
    elif algo == 'Random Forest Regression':
        model = RandomForestRegressor(random_state=42, **params)
    else:
        model = LinearRegression()

    model.fit(x_train, y_train)
    return model, model.predict(x_test)


def _show_metrics(y_test, y_predictions, n_predictors, r2_lead=""):
    """Display R², adjusted R² and RMSE in three columns and return them.

    *r2_lead* is markup inserted before the highlighted R² value.
    The returned dict uses the keys expected in the saved report.
    """
    r2 = r2_score(y_test, y_predictions)
    # NOTE(review): for polynomial regression k is the number of selected
    # columns, not the number of expanded polynomial terms - confirm intended.
    adj_r2 = _adjusted_r2(r2, len(y_test), n_predictors)
    rmse = np.sqrt(mean_squared_error(y_test, y_predictions))

    c1, c2, c3 = st.columns(3)
    with c1:
        st.write(f"<b>R2 score is: {r2_lead}<mark> {r2:.6f} </mark></b>", unsafe_allow_html=True)
    with c2:
        st.write(f"<b>Adjusted R2 score is: <mark> {adj_r2:.6f} </mark></b>", unsafe_allow_html=True)
    with c3:
        st.write(f"<b>Root mean squared error is: <mark> {rmse:.6f} </mark></b>", unsafe_allow_html=True)

    return {
        'R2 Score': r2,
        'Adjusted R2 Score': adj_r2,
        'Root Mean Squared Error': rmse,
    }


def _show_diagnostic_plots(y_test, y_predictions):
    """Draw the regression and residual plots and return both figures.

    Each plot gets its own figure object so the saved report can hold the
    regression plot and the residual plot separately.
    """
    st.subheader("Regression Plot")
    e = st.expander("")
    x_bins = e.number_input("x_bins", 10, 100, 10)
    regression_fig, ax = plt.subplots()
    sns.regplot(x=y_test, y=y_predictions, robust=True, color='blue', x_bins=x_bins, ax=ax)
    e.pyplot(fig=regression_fig, clear_figure=None)

    st.subheader("Residual Plot")
    e = st.expander("")
    residual_fig, ax = plt.subplots()
    sns.residplot(x=y_test, y=y_predictions, robust=True, color='blue', ax=ax)
    e.pyplot(fig=residual_fig, clear_figure=None)

    return {
        'Regression Plot': regression_fig,
        'Residual Plot': residual_fig,
    }


def main():
    """Render the regression analysis page.

    Lets the user upload a CSV, pick predictor/target columns and a regression
    algorithm, then fits the model and shows metrics and plots. Clicking
    "Save to Report" adds the current run to
    ``st.session_state['regression_results']``, which accumulates across
    reruns so several algorithms can be saved. Navigation buttons set
    ``st.session_state.page`` and trigger a rerun.
    """
    st.title("Regression Analysis Tool")
    st.write("Upload your dataset and choose the regression algorithm to get started.")

    if st.button("Return to Main Page"):
        st.session_state.page = "main"
        _rerun()

    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    if st.sidebar.button("Generate Comparative Report"):
        st.session_state.page = "make_report"
        _rerun()

    df = pd.read_csv(uploaded_file)
    st.write("Dataset Preview:")
    st.write(df.head())

    independent_columns = st.sidebar.multiselect("Select independent (predictor) variables:", options=df.columns.tolist())
    dependent_column = st.sidebar.selectbox("Select dependent (target) variable:", options=df.columns.tolist())

    if not independent_columns:
        st.warning("Please select at least one independent (predictor) variable.")
        return

    if not dependent_column:
        st.warning("Please select the dependent (target) variable.")
        return

    # Using the target as a predictor would leak the answer into the features.
    if dependent_column in independent_columns:
        st.warning("The dependent (target) variable cannot also be selected as an independent (predictor) variable.")
        return

    categorical_option = st.sidebar.radio(
        "Do you have any categorical variables?",
        ('No', 'Yes')
    )

    if categorical_option == 'Yes':
        categorical_columns = st.sidebar.multiselect("Select categorical columns:", options=independent_columns)
    else:
        categorical_columns = []

    # Reuse the results saved on earlier reruns; starting from an empty dict on
    # every run would discard everything except the most recent save.
    if 'regression_results' not in st.session_state:
        st.session_state['regression_results'] = {}
    regression_results = st.session_state['regression_results']

    algo = st.sidebar.selectbox("Select Regression Algorithm:", list(_ALGORITHMS))
    key_suffix, results_header, report_key = _ALGORITHMS[algo]

    df_processed = _encode_categoricals(df, categorical_columns)

    st.write(f"<h2><b><font>{algo}</b></h2>", unsafe_allow_html=True)
    params = _select_hyperparameters(algo)
    test_size = st.sidebar.slider("Test size", 0.1, 0.5, 0.2, 0.01, key=f"test_size_{key_suffix}")

    x_train, x_test, y_train, y_test = _split(df_processed, independent_columns, dependent_column, test_size)
    _show_split(x_train, x_test, test_size)

    model, y_predictions = _fit_and_predict(algo, params, x_train, x_test, y_train)

    st.header(results_header)
    result = _show_metrics(
        y_test,
        y_predictions,
        len(independent_columns),
        r2_lead="<br>" if algo == 'Linear Regression' else "",
    )

    if algo == 'Decision Tree Regression':
        st.subheader("Decision Tree Plot")
        tree_fig, ax = plt.subplots(figsize=(10, 10))
        plot_tree(model, feature_names=independent_columns, filled=True, ax=ax)
        st.pyplot(tree_fig)
        result['Decision Tree Plot'] = tree_fig
    else:
        result.update(_show_diagnostic_plots(y_test, y_predictions))

    if st.button("Save to Report", key=f"save_{key_suffix}"):
        regression_results[report_key] = result

    # Save the regression results to session state
    st.session_state['regression_results'] = regression_results

# Render the page only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()