-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
248 lines (209 loc) · 14.1 KB
/
app.py
File metadata and controls
248 lines (209 loc) · 14.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import streamlit as st
from src import data_processing, streamlit_utils, projects, predictions
from src.data_recorder import DataProcessingRecorder
import pandas as pd
from src.data_processing_pipeline import DataProcessingPipeline
import json
import os
# Initialize the recorder in Streamlit's state if it's not already present.
# Streamlit re-executes this module on every rerun, so module-level state
# must be guarded to survive across interactions.
if 'data_processing_recorder' not in st.session_state:
    st.session_state['data_processing_recorder'] = DataProcessingRecorder()
# Initialize the uploaded file name in Streamlit's state if it's not already present.
# Used to detect whether a newly uploaded CSV differs from the last one processed.
if 'uploaded_file_name' not in st.session_state:
    st.session_state['uploaded_file_name'] = None
def _persist_and_preview():
    """Save the current processed frame to the active project and show its head."""
    projects.save_project_data(st.session_state.selected_project, st.session_state.processed_data)
    st.write(st.session_state.processed_data.head())


def _render_sidebar():
    """Sidebar: project creation form, project selector, and a button that
    loads the selected project's saved data plus any recorded processing steps.

    Side effects: sets ``st.session_state.selected_project`` and, on a
    successful load, ``st.session_state.processed_data``.
    """
    with st.sidebar:
        st.header('Projects')
        projects.add_project_form()
        selected_project = st.selectbox('Select a project', [''] + projects.get_project_names())
        if selected_project:
            st.session_state.selected_project = selected_project
            if st.button('Load Project Data'):
                loaded_data = projects.load_project_data(selected_project)
                if loaded_data is not None:
                    # load_project_data may return a tuple; the first element
                    # is assumed to be the DataFrame.
                    if isinstance(loaded_data, tuple):
                        loaded_data = loaded_data[0]
                    st.session_state.processed_data = loaded_data
                    st.success('Project data loaded successfully.')
                    # Restore previously recorded processing steps, if any.
                    steps = projects.load_processing_steps(selected_project)
                    if steps:
                        st.session_state.data_processing_recorder.load_steps(steps)
                else:
                    st.error('No processed data found for this project, please upload new data.')


def _render_upload():
    """CSV uploader for the active project.

    Re-reads and persists the file only when its name differs from the last
    processed upload, so a Streamlit rerun does not reprocess the same file.
    """
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file:
        if st.session_state.uploaded_file_name != uploaded_file.name:
            st.session_state.uploaded_file_name = uploaded_file.name
            data = data_processing.read_csv(uploaded_file)
            st.session_state.processed_data = data.copy()
            projects.save_project_data(st.session_state.selected_project, data)


def _render_processing_expanders(target_column):
    """Data-wrangling expanders (drop / impute / encode / extract / filter).

    Each applied operation is recorded in the session's
    DataProcessingRecorder and the updated frame is saved to the project.
    """
    with st.expander("Drop Columns"):
        columns_to_drop = st.multiselect("Select columns to drop", st.session_state.processed_data.columns)
        if columns_to_drop and st.button("Drop Selected Columns"):
            st.session_state.processed_data = data_processing.drop_columns(
                st.session_state.processed_data, columns_to_drop)
            st.session_state.data_processing_recorder.record_step('drop_columns', columns=columns_to_drop)
            _persist_and_preview()

    with st.expander("Handle Missing Values"):
        # UI label -> internal strategy code understood by data_processing.
        strategy_map = {
            "Drop Rows": "drop",
            "Fill with Mean": "mean",
            "Fill with Median": "median",
            "Fill with Mode": "mode"
        }
        strategy = st.radio("Choose a strategy", list(strategy_map.keys()))
        if st.button("Apply Strategy"):
            strategy_code = strategy_map[strategy]
            st.session_state.processed_data = data_processing.handle_missing_values(
                st.session_state.processed_data, strategy_code)
            st.session_state.data_processing_recorder.record_step('handle_missing_values', strategy=strategy_code)
            _persist_and_preview()

    with st.expander("Convert Categorical Columns to Numerical"):
        columns_to_convert = st.multiselect(
            "Select columns to convert",
            st.session_state.processed_data.select_dtypes(include=['object']).columns)
        if columns_to_convert and st.button("Convert"):
            st.session_state.processed_data = data_processing.convert_categorical_to_numerical(
                st.session_state.processed_data, columns_to_convert)
            st.session_state.data_processing_recorder.record_step(
                'convert_categorical_to_numerical', columns=columns_to_convert)
            _persist_and_preview()

    with st.expander("Feature Extraction"):
        extracted_data, extraction_params = streamlit_utils.feature_extraction_ui(
            st.session_state.processed_data, target_column=target_column)
        if extracted_data is not None:
            st.session_state.processed_data = extracted_data
            # Record the extraction method together with its parameters so the
            # step can be replayed on new data later.
            st.session_state.data_processing_recorder.record_step('feature_extraction', **extraction_params)
            projects.save_project_data(st.session_state.selected_project, st.session_state.processed_data)
            st.dataframe(st.session_state.processed_data)

    with st.expander("Filter Data"):
        filtered_data = streamlit_utils.apply_filters(st.session_state.processed_data)
        if filtered_data is not None:
            st.session_state.processed_data = filtered_data
            st.session_state.data_processing_recorder.record_step('filter_data')
            projects.save_project_data(st.session_state.selected_project, st.session_state.processed_data)
            st.dataframe(st.session_state.processed_data)


def _render_training_expander(target_column):
    """Model-training expander; records a 'model_training' step on success."""
    with st.expander("Model Training"):
        if 'selected_project' in st.session_state and st.session_state.selected_project:
            results = streamlit_utils.train_model_ui(
                st.session_state.processed_data, target_column, st.session_state.selected_project)
            if results:
                st.write(results)
                st.session_state.data_processing_recorder.record_step('model_training', target_column=target_column)


def _render_pipeline_expander():
    """Replay the project's recorded processing steps on a newly uploaded CSV.

    Reads ``processing_steps.json`` from the project directory, applies each
    step (looked up by name on a DataProcessingPipeline instance) to the new
    data, saves the result, and offers it for download. The user-supplied
    target column is excluded from processing.
    """
    with st.expander("Apply Saved Pipeline to New Data"):
        new_data_file = st.file_uploader("Upload New Data CSV File", type="csv")
        target_column = st.text_input("Enter the name of the target column (it will be excluded from processing):")
        if new_data_file and target_column and 'selected_project' in st.session_state and st.session_state.selected_project:
            steps_file_path = os.path.join('projects', st.session_state.selected_project, 'processing_steps.json')
            if os.path.exists(steps_file_path):
                with open(steps_file_path, 'r') as file:
                    steps = json.load(file)
                pipeline = DataProcessingPipeline()
                new_data = pd.read_csv(new_data_file)
                # The target column must not be transformed by the pipeline.
                if target_column in new_data.columns:
                    new_data = new_data.drop(columns=[target_column])
                try:
                    for step in steps:
                        step_name = step["step"]
                        parameters = step.get("parameters", {})
                        if hasattr(pipeline, step_name):
                            # Dispatch by step name to the pipeline method and
                            # call it with the recorded parameters.
                            step_function = getattr(pipeline, step_name)
                            new_data = step_function(new_data, **parameters)
                        else:
                            st.error(f"Step '{step_name}' is not a method of DataProcessingPipeline")
                    st.success('Pipeline applied successfully.')
                    st.write(new_data.head())
                    # Persist the processed data and remember its path for the
                    # prediction section.
                    processed_new_data_path = os.path.join(
                        'projects', st.session_state.selected_project, 'processed_new_data.csv')
                    new_data.to_csv(processed_new_data_path, index=False)
                    st.session_state['processed_new_data_path'] = processed_new_data_path
                    st.success(f"Processed new data saved to {processed_new_data_path}")
                    with open(processed_new_data_path, "rb") as f:
                        processed_new_data = f.read()
                    st.download_button(
                        label="Download Processed Data as CSV",
                        data=processed_new_data,
                        file_name="processed_new_data.csv",
                        mime="text/csv",
                        key='download-csv'
                    )
                except Exception as e:
                    st.error(f"An error occurred: {e}")
            else:
                st.error(f"Steps file does not exist: {steps_file_path}")


def _render_prediction_expander():
    """Load the project's trained model and run it on pipeline-processed data.

    Requires ``processed_new_data_path`` in session state (set by the
    pipeline expander); predictions are shown alongside the features and
    saved to the project's predictions.csv.
    """
    with st.expander("Model Prediction"):
        model_path = os.path.join('projects', st.session_state.selected_project, 'model.joblib')
        predictions_path = os.path.join('projects', st.session_state.selected_project, 'predictions.csv')
        try:
            model = predictions.load_model(model_path)
            st.success("Model loaded successfully.")
        except Exception as e:
            st.error(f"An error occurred while loading the model: {e}")
            model = None
        if 'processed_new_data_path' in st.session_state:
            try:
                data_for_prediction = pd.read_csv(st.session_state['processed_new_data_path'])
                st.write("Data for prediction loaded successfully.")
                st.write(data_for_prediction.head())
            except Exception as e:
                st.error(f"An error occurred while loading the data for prediction: {e}")
                data_for_prediction = None
        else:
            st.warning("No processed data available for predictions. Please apply the pipeline to new data first.")
            data_for_prediction = None
        if model and data_for_prediction is not None:
            if st.button("Make Predictions"):
                try:
                    preds = predictions.predict(model, data_for_prediction)
                    st.success("Predictions made successfully.")
                    # Show predictions next to their input features rather
                    # than the raw prediction array alone.
                    combined_results = data_for_prediction.copy()
                    combined_results['Predictions'] = preds
                    st.write(combined_results.head())
                    if predictions.save_predictions(data_for_prediction, preds, predictions_path):
                        st.success(f"Predictions saved to {predictions_path}")
                        with open(predictions_path, "rb") as file:
                            st.download_button(
                                label="Download Predictions CSV",
                                data=file,
                                file_name="predictions.csv",
                                mime="text/csv",
                            )
                except Exception as e:
                    st.error(f"An error occurred while making predictions: {e}")


def main():
    """Render the ML AutoTrainer Engine Streamlit app.

    Flow: pick or create a project in the sidebar, upload a CSV, apply and
    record preprocessing steps, train a model, replay the saved pipeline on
    new data, and run predictions with the stored model. All state lives in
    ``st.session_state`` because Streamlit reruns this function on every
    interaction.
    """
    st.title("ML AutoTrainer Engine")  # fixed typo: was "Enginer"

    _render_sidebar()

    if 'selected_project' in st.session_state and st.session_state.selected_project:
        _render_upload()

    # Show the processing/training UI only once a non-empty DataFrame is loaded.
    if ('processed_data' in st.session_state and
            isinstance(st.session_state.processed_data, pd.DataFrame) and
            not st.session_state.processed_data.empty):
        st.write(st.session_state.processed_data.head())
        target_column = streamlit_utils.select_target_column(st.session_state.processed_data)
        st.write(f"Selected Target Column: {target_column}")

        _render_processing_expanders(target_column)
        _render_training_expander(target_column)
        _render_pipeline_expander()
        _render_prediction_expander()

        # Persist the recorded steps after each interaction so the pipeline
        # can later be replayed on new data.
        projects.save_processing_steps(
            st.session_state.selected_project,
            st.session_state.data_processing_recorder.save_steps())
# Streamlit executes this module top-to-bottom on every rerun; launch the
# app UI only when run as the entry script.
if __name__ == "__main__":
    main()