-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
248 lines (209 loc) · 14.1 KB
/
app.py
File metadata and controls
248 lines (209 loc) · 14.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import streamlit as st
from src import data_processing, streamlit_utils, projects, predictions
from src.data_recorder import DataProcessingRecorder
import pandas as pd
from src.data_processing_pipeline import DataProcessingPipeline
import json
import os
# Initialize the recorder in Streamlit's state if it's not already present.
# Streamlit re-executes this module on every rerun, so module-level state
# must be guarded to survive across interactions.
if 'data_processing_recorder' not in st.session_state:
    st.session_state['data_processing_recorder'] = DataProcessingRecorder()
# Initialize the uploaded file name in Streamlit's state if it's not already present.
# Used to detect whether a newly uploaded CSV differs from the last one processed.
if 'uploaded_file_name' not in st.session_state:
    st.session_state['uploaded_file_name'] = None
def _persist_and_preview():
    """Save the current processed frame to the active project and show its head."""
    projects.save_project_data(st.session_state.selected_project, st.session_state.processed_data)
    st.write(st.session_state.processed_data.head())


def _render_sidebar():
    """Sidebar: project creation form, project selector, and a button that
    loads the selected project's saved data plus any recorded processing steps.

    Side effects: sets ``st.session_state.selected_project`` and, on a
    successful load, ``st.session_state.processed_data``.
    """
    with st.sidebar:
        st.header('Projects')
        projects.add_project_form()
        selected_project = st.selectbox('Select a project', [''] + projects.get_project_names())
        if selected_project:
            st.session_state.selected_project = selected_project
            if st.button('Load Project Data'):
                loaded_data = projects.load_project_data(selected_project)
                if loaded_data is not None:
                    # load_project_data may return a tuple; the first element
                    # is assumed to be the DataFrame.
                    if isinstance(loaded_data, tuple):
                        loaded_data = loaded_data[0]
                    st.session_state.processed_data = loaded_data
                    st.success('Project data loaded successfully.')
                    # Restore previously recorded processing steps, if any.
                    steps = projects.load_processing_steps(selected_project)
                    if steps:
                        st.session_state.data_processing_recorder.load_steps(steps)
                else:
                    st.error('No processed data found for this project, please upload new data.')


def _render_upload():
    """CSV uploader for the active project.

    Re-reads and persists the file only when its name differs from the last
    processed upload, so a Streamlit rerun does not reprocess the same file.
    """
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file:
        if st.session_state.uploaded_file_name != uploaded_file.name:
            st.session_state.uploaded_file_name = uploaded_file.name
            data = data_processing.read_csv(uploaded_file)
            st.session_state.processed_data = data.copy()
            projects.save_project_data(st.session_state.selected_project, data)


def _render_processing_expanders(target_column):
    """Data-wrangling expanders (drop / impute / encode / extract / filter).

    Each applied operation is recorded in the session's
    DataProcessingRecorder and the updated frame is saved to the project.
    """
    with st.expander("Drop Columns"):
        columns_to_drop = st.multiselect("Select columns to drop", st.session_state.processed_data.columns)
        if columns_to_drop and st.button("Drop Selected Columns"):
            st.session_state.processed_data = data_processing.drop_columns(
                st.session_state.processed_data, columns_to_drop)
            st.session_state.data_processing_recorder.record_step('drop_columns', columns=columns_to_drop)
            _persist_and_preview()

    with st.expander("Handle Missing Values"):
        # UI label -> internal strategy code understood by data_processing.
        strategy_map = {
            "Drop Rows": "drop",
            "Fill with Mean": "mean",
            "Fill with Median": "median",
            "Fill with Mode": "mode"
        }
        strategy = st.radio("Choose a strategy", list(strategy_map.keys()))
        if st.button("Apply Strategy"):
            strategy_code = strategy_map[strategy]
            st.session_state.processed_data = data_processing.handle_missing_values(
                st.session_state.processed_data, strategy_code)
            st.session_state.data_processing_recorder.record_step('handle_missing_values', strategy=strategy_code)
            _persist_and_preview()

    with st.expander("Convert Categorical Columns to Numerical"):
        columns_to_convert = st.multiselect(
            "Select columns to convert",
            st.session_state.processed_data.select_dtypes(include=['object']).columns)
        if columns_to_convert and st.button("Convert"):
            st.session_state.processed_data = data_processing.convert_categorical_to_numerical(
                st.session_state.processed_data, columns_to_convert)
            st.session_state.data_processing_recorder.record_step(
                'convert_categorical_to_numerical', columns=columns_to_convert)
            _persist_and_preview()

    with st.expander("Feature Extraction"):
        extracted_data, extraction_params = streamlit_utils.feature_extraction_ui(
            st.session_state.processed_data, target_column=target_column)
        if extracted_data is not None:
            st.session_state.processed_data = extracted_data
            # Record the extraction method together with its parameters so the
            # step can be replayed on new data later.
            st.session_state.data_processing_recorder.record_step('feature_extraction', **extraction_params)
            projects.save_project_data(st.session_state.selected_project, st.session_state.processed_data)
            st.dataframe(st.session_state.processed_data)

    with st.expander("Filter Data"):
        filtered_data = streamlit_utils.apply_filters(st.session_state.processed_data)
        if filtered_data is not None:
            st.session_state.processed_data = filtered_data
            st.session_state.data_processing_recorder.record_step('filter_data')
            projects.save_project_data(st.session_state.selected_project, st.session_state.processed_data)
            st.dataframe(st.session_state.processed_data)


def _render_training_expander(target_column):
    """Model-training expander; records a 'model_training' step on success."""
    with st.expander("Model Training"):
        if 'selected_project' in st.session_state and st.session_state.selected_project:
            results = streamlit_utils.train_model_ui(
                st.session_state.processed_data, target_column, st.session_state.selected_project)
            if results:
                st.write(results)
                st.session_state.data_processing_recorder.record_step('model_training', target_column=target_column)


def _render_pipeline_expander():
    """Replay the project's recorded processing steps on a newly uploaded CSV.

    Reads ``processing_steps.json`` from the project directory, applies each
    step (looked up by name on a DataProcessingPipeline instance) to the new
    data, saves the result, and offers it for download. The user-supplied
    target column is excluded from processing.
    """
    with st.expander("Apply Saved Pipeline to New Data"):
        new_data_file = st.file_uploader("Upload New Data CSV File", type="csv")
        target_column = st.text_input("Enter the name of the target column (it will be excluded from processing):")
        if new_data_file and target_column and 'selected_project' in st.session_state and st.session_state.selected_project:
            steps_file_path = os.path.join('projects', st.session_state.selected_project, 'processing_steps.json')
            if os.path.exists(steps_file_path):
                with open(steps_file_path, 'r') as file:
                    steps = json.load(file)
                pipeline = DataProcessingPipeline()
                new_data = pd.read_csv(new_data_file)
                # The target column must not be transformed by the pipeline.
                if target_column in new_data.columns:
                    new_data = new_data.drop(columns=[target_column])
                try:
                    for step in steps:
                        step_name = step["step"]
                        parameters = step.get("parameters", {})
                        if hasattr(pipeline, step_name):
                            # Dispatch by step name to the pipeline method and
                            # call it with the recorded parameters.
                            step_function = getattr(pipeline, step_name)
                            new_data = step_function(new_data, **parameters)
                        else:
                            st.error(f"Step '{step_name}' is not a method of DataProcessingPipeline")
                    st.success('Pipeline applied successfully.')
                    st.write(new_data.head())
                    # Persist the processed data and remember its path for the
                    # prediction section.
                    processed_new_data_path = os.path.join(
                        'projects', st.session_state.selected_project, 'processed_new_data.csv')
                    new_data.to_csv(processed_new_data_path, index=False)
                    st.session_state['processed_new_data_path'] = processed_new_data_path
                    st.success(f"Processed new data saved to {processed_new_data_path}")
                    with open(processed_new_data_path, "rb") as f:
                        processed_new_data = f.read()
                    st.download_button(
                        label="Download Processed Data as CSV",
                        data=processed_new_data,
                        file_name="processed_new_data.csv",
                        mime="text/csv",
                        key='download-csv'
                    )
                except Exception as e:
                    st.error(f"An error occurred: {e}")
            else:
                st.error(f"Steps file does not exist: {steps_file_path}")


def _render_prediction_expander():
    """Load the project's trained model and run it on pipeline-processed data.

    Requires ``processed_new_data_path`` in session state (set by the
    pipeline expander); predictions are shown alongside the features and
    saved to the project's predictions.csv.
    """
    with st.expander("Model Prediction"):
        model_path = os.path.join('projects', st.session_state.selected_project, 'model.joblib')
        predictions_path = os.path.join('projects', st.session_state.selected_project, 'predictions.csv')
        try:
            model = predictions.load_model(model_path)
            st.success("Model loaded successfully.")
        except Exception as e:
            st.error(f"An error occurred while loading the model: {e}")
            model = None
        if 'processed_new_data_path' in st.session_state:
            try:
                data_for_prediction = pd.read_csv(st.session_state['processed_new_data_path'])
                st.write("Data for prediction loaded successfully.")
                st.write(data_for_prediction.head())
            except Exception as e:
                st.error(f"An error occurred while loading the data for prediction: {e}")
                data_for_prediction = None
        else:
            st.warning("No processed data available for predictions. Please apply the pipeline to new data first.")
            data_for_prediction = None
        if model and data_for_prediction is not None:
            if st.button("Make Predictions"):
                try:
                    preds = predictions.predict(model, data_for_prediction)
                    st.success("Predictions made successfully.")
                    # Show predictions next to their input features rather
                    # than the raw prediction array alone.
                    combined_results = data_for_prediction.copy()
                    combined_results['Predictions'] = preds
                    st.write(combined_results.head())
                    if predictions.save_predictions(data_for_prediction, preds, predictions_path):
                        st.success(f"Predictions saved to {predictions_path}")
                        with open(predictions_path, "rb") as file:
                            st.download_button(
                                label="Download Predictions CSV",
                                data=file,
                                file_name="predictions.csv",
                                mime="text/csv",
                            )
                except Exception as e:
                    st.error(f"An error occurred while making predictions: {e}")


def main():
    """Render the ML AutoTrainer Engine Streamlit app.

    Flow: pick or create a project in the sidebar, upload a CSV, apply and
    record preprocessing steps, train a model, replay the saved pipeline on
    new data, and run predictions with the stored model. All state lives in
    ``st.session_state`` because Streamlit reruns this function on every
    interaction.
    """
    st.title("ML AutoTrainer Engine")  # fixed typo: was "Enginer"

    _render_sidebar()

    if 'selected_project' in st.session_state and st.session_state.selected_project:
        _render_upload()

    # Show the processing/training UI only once a non-empty DataFrame is loaded.
    if ('processed_data' in st.session_state and
            isinstance(st.session_state.processed_data, pd.DataFrame) and
            not st.session_state.processed_data.empty):
        st.write(st.session_state.processed_data.head())
        target_column = streamlit_utils.select_target_column(st.session_state.processed_data)
        st.write(f"Selected Target Column: {target_column}")

        _render_processing_expanders(target_column)
        _render_training_expander(target_column)
        _render_pipeline_expander()
        _render_prediction_expander()

        # Persist the recorded steps after each interaction so the pipeline
        # can later be replayed on new data.
        projects.save_processing_steps(
            st.session_state.selected_project,
            st.session_state.data_processing_recorder.save_steps())
# Streamlit executes this module top-to-bottom on every rerun; launch the
# app UI only when run as the entry script.
if __name__ == "__main__":
    main()