diff --git a/.gitignore b/.gitignore index 73e0641..955fea2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,17 @@ Dataset/* !Dataset/.gitkeep .venv/* +ControlDataset/* +Models/* +NotInUseModels/* +Result/* +Scalers/* +TestDataset/* +inference_results.csv +FeatureWeights/* +CombinedFeatureWeights.csv Bladder/Dataset/* Breast/Dataset/* Brain/Dataset/* Liver/Dataset/* -Test_data/* \ No newline at end of file +Test_data/* diff --git a/Baseline/baseline_aggregate_and_infer.ipynb b/Baseline/baseline_aggregate_and_infer.ipynb new file mode 100644 index 0000000..04ce609 --- /dev/null +++ b/Baseline/baseline_aggregate_and_infer.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import joblib\n", + "import pandas as pd\n", + "from sklearn.preprocessing import StandardScaler\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_models(models_folder=\"Models\"):\n", + " \"\"\"Load all saved models from the specified folder.\"\"\"\n", + " models = {}\n", + " for file_name in os.listdir(models_folder):\n", + " if file_name.endswith(\".joblib\"):\n", + " model_name, cancer_type = file_name.split(\"_\")\n", + " model_path = os.path.join(models_folder, file_name)\n", + " models[(model_name, cancer_type)] = joblib.load(model_path)\n", + " print(f\"Loaded model: {model_name} for cancer type: {cancer_type}\")\n", + " return models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess_data(new_data_path, scaler=None):\n", + " \"\"\"Load and preprocess new data.\"\"\"\n", + " # Load the data\n", + " data = pd.read_csv(new_data_path)\n", + " X = data.drop(['cancer_type', 'type'], axis=1, errors='ignore')\n", + " \n", + " # standardize the data\n", + " if scaler is None:\n", + " scaler = 
StandardScaler()\n", + " X_scaled = scaler.fit_transform(X)\n", + " else:\n", + " X_scaled = scaler.transform(X)\n", + " \n", + " return X_scaled, data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_inference(models, test_data_path, scalers_folder=\"Scalers\"):\n", + " # Load the test data\n", + " test_df = pd.read_csv(test_data_path)\n", + " test_features = test_df.drop(['cancer_type', 'type'], axis=1)\n", + "\n", + " # List all scaler files in the Scalers folder\n", + " scaler_files = [f for f in os.listdir(scalers_folder) if f.endswith('.joblib')]\n", + " \n", + " results = []\n", + " \n", + " for index, row in test_df.iterrows():\n", + " row_predictions = {} # Store model predictions and their confidence\n", + " \n", + " for (model_name, cancer_type), model in models.items():\n", + " # Find the scaler corresponding to the cancer type\n", + " cancer_type, _ = cancer_type.split(\".\")\n", + " scaler_filename = f\"{cancer_type}_scaler.joblib\"\n", + " if scaler_filename in scaler_files:\n", + " scaler_path = os.path.join(scalers_folder, scaler_filename)\n", + " scaler = joblib.load(scaler_path)\n", + " test_features_scaled = scaler.transform([test_features.iloc[index]]) # Transform a single row\n", + " \n", + " # Run inference with the model\n", + " probabilities = model.predict_proba(test_features_scaled)\n", + " confidence = probabilities[0][1]\n", + " \n", + " # Store the model's prediction and confidence\n", + " row_predictions[f\"{model_name}-{cancer_type}\"] = {\n", + " 'cancer_type': cancer_type,\n", + " 'predicted_class': model.predict(test_features_scaled)[0],\n", + " 'confidence': confidence,\n", + " 'probabilities': probabilities[0].tolist()\n", + " }\n", + " else:\n", + " print(f\"couldn't find {cancer_type}_scaler.joblib\")\n", + "\n", + " # Determine the final prediction based on the highest confidence\n", + " final_prediction = \"normal\"\n", + " max_confidence = -1\n", + " 
\n", + " for model_name, prediction_info in row_predictions.items():\n", + " if prediction_info['confidence'] > max_confidence and prediction_info['confidence'] > 0.5:\n", + " max_confidence = prediction_info['confidence']\n", + " final_prediction = prediction_info['cancer_type']\n", + "\n", + " # Append the final prediction for the current data point\n", + " results.append({\n", + " 'index': index,\n", + " 'cancer_type': final_prediction,\n", + " 'confidence': max_confidence if final_prediction is not \"normal\" else -1.0,\n", + " 'predictions': row_predictions\n", + " })\n", + "\n", + " return results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def save_inference_results(results, output_file=\"inference_results.csv\"):\n", + " # Convert the results to a DataFrame\n", + " df_results = pd.DataFrame(results)\n", + " df_results.to_csv(output_file, index=False)\n", + " print(f\"Inference results saved to {output_file}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the updated function\n", + "models_folder = \"Models\"\n", + "new_data_path = \"TestDataset/test_data.csv\"\n", + "\n", + "# Load models\n", + "models = load_models(models_folder=models_folder)\n", + "\n", + "\n", + "# Run inference\n", + "inference_results = run_inference(models, new_data_path)\n", + "\n", + "# Save results\n", + "save_inference_results(inference_results, output_file=\"inference_results.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 88.24%\n" + ] + } + ], + "source": [ + "test_df = pd.read_csv(\"TestDataset/test_data.csv\")\n", + "\n", + "predictions_df = pd.read_csv(\"inference_results.csv\")\n", + "\n", + "\n", + "accuracy = (predictions_df[\"cancer_type\"] == test_df[\"cancer_type\"]).mean()\n", + 
"print(f\"Accuracy: {accuracy:.2%}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Baseline/baseline_train_and_evaluate.ipynb b/Baseline/baseline_train_and_evaluate.ipynb new file mode 100644 index 0000000..d9d3440 --- /dev/null +++ b/Baseline/baseline_train_and_evaluate.ipynb @@ -0,0 +1,349 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.dummy import DummyClassifier\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import LeaveOneOut\n", + "from datetime import datetime\n", + "import os\n", + "import joblib\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def load_and_combine_datasets(control_path, cancer_path):\n", + " # Load datasets\n", + " control_df = pd.read_csv(control_path)\n", + " cancer_df = pd.read_csv(cancer_path)\n", + " \n", + " # Determine the size for undersampling\n", + " sample_size = len(cancer_df)\n", + " \n", + " # Undersample the healthy dataset\n", + " 
control_df_sample = control_df.sample(n=sample_size, random_state=42)\n", + " \n", + " # Combine datasets and shuffle\n", + " combined_df = pd.concat([control_df_sample, cancer_df]).sample(frac=1, random_state=42).reset_index(drop=True)\n", + " \n", + " # Split into features and labels\n", + " X = combined_df.drop(['cancer_type', 'type'], axis=1)\n", + " y = combined_df['cancer_type'].apply(lambda x: 1 if x != 'normal' else 0) # 1 = cancer, 0 = healthy\n", + " \n", + " return X, y\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "def save_model(model, model_name, cancer_type, folder=\"Models\"):\n", + " \"\"\"Save the model to a file using joblib.\"\"\"\n", + " if not os.path.exists(folder):\n", + " os.makedirs(folder)\n", + " model_filename = f\"{folder}/{model_name}_{cancer_type}.joblib\"\n", + " joblib.dump(model, model_filename)\n", + " print(f\"Model {model_name} saved to {model_filename}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "# Train and evaluate models using LOOCV\n", + "def train_and_evaluate_loocv(X, y, models, cancer_type):\n", + " loo = LeaveOneOut()\n", + " results = {}\n", + "\n", + " for model_name, (model, param_grid) in models.items():\n", + " print(f\"Training {model_name} with LOOCV...\")\n", + " fold_scores = []\n", + " all_y_true = []\n", + " all_y_pred = []\n", + "\n", + " # GridSearch for hyperparameter tuning\n", + " grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)\n", + " grid_search.fit(X, y)\n", + " best_model = grid_search.best_estimator_\n", + " # Save the best model\n", + " save_model(best_model, model_name, cancer_type)\n", + "\n", + " # perform LOOCV\n", + " for train_index, test_index in loo.split(X):\n", + " X_train, X_test = X[train_index], X[test_index]\n", + " y_train, y_test = y[train_index], y[test_index]\n", + "\n", + " best_model.fit(X_train, 
y_train)\n", + " y_pred = best_model.predict(X_test)\n", + "\n", + " # Track results\n", + " fold_scores.append(best_model.score(X_test, y_test))\n", + " all_y_true.extend(y_test)\n", + " all_y_pred.extend(y_pred)\n", + "\n", + " # overall stats\n", + " avg_score = np.mean(fold_scores)\n", + " classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)\n", + "\n", + " # Store results\n", + " results[model_name] = {\n", + " 'best_params': grid_search.best_params_,\n", + " 'avg_loocv_score': avg_score,\n", + " 'classification_report': classification_report_dict\n", + " }\n", + "\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def save_results_to_csv(results, output_file):\n", + " # Convert the results dictionary into a DataFrame\n", + " print(results)\n", + " rows = []\n", + " for model_name, result in results.items():\n", + " print(result)\n", + " row = {\n", + " 'model': model_name,\n", + " 'best_params': result['best_params'],\n", + " 'avg loocv score': result['avg_loocv_score'],\n", + " 'classification_report': str(result['classification_report']) # Serialize the report as a string\n", + " }\n", + " rows.append(row)\n", + " \n", + " df = pd.DataFrame(rows)\n", + " df.to_csv(output_file, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define models and parameter grids\n", + "models_to_test = {\n", + " 'Random Forest': (RandomForestClassifier(random_state=42), {\n", + " 'n_estimators': [50, 100],\n", + " 'max_depth': [None, 10]\n", + " }),\n", + " # 'SVM': (SVC(kernel='linear', random_state=42), {\n", + " # 'C': [0.1, 1, 10]\n", + " # }),\n", + " 'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500, penalty='l1', solver='liblinear'), {\n", + " 'C': [0.1, 1]\n", + " }),\n", + " # 'Naive Bayes': (GaussianNB(), {}),\n", + " # 'KNN': 
(KNeighborsClassifier(), {\n", + " # 'n_neighbors': [3, 5, 7, 10],\n", + " # 'weights': ['uniform', 'distance']\n", + " # }),\n", + " # 'ZeroR': (DummyClassifier(strategy='most_frequent'), {}), # ZeroR always predicts the most frequent class\n", + "\n", + "}\n", + "\n", + "all_results = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing datasets: Dataset/throat.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_throat.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_throat.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}}\n", + 
"{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}\n", + "Pipeline complete. 
Results saved to Result/results_throat_pca_20241130_210102.csv\n", + "Processing datasets: Dataset/prostate.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_prostate.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_prostate.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9347826086956522, 'classification_report': {'0': {'precision': 0.9761904761904762, 'recall': 0.8913043478260869, 'f1-score': 0.9318181818181818, 'support': 46.0}, '1': {'precision': 0.9, 'recall': 0.9782608695652174, 'f1-score': 0.9375, 'support': 46.0}, 'accuracy': 0.9347826086956522, 'macro avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956521, 'f1-score': 0.9346590909090908, 'support': 92.0}, 'weighted avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956522, 'f1-score': 0.9346590909090908, 'support': 92.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.9565217391304348, 'classification_report': {'0': {'precision': 0.9772727272727273, 'recall': 0.9347826086956522, 'f1-score': 0.9555555555555557, 'support': 46.0}, '1': {'precision': 0.9375, 'recall': 0.9782608695652174, 'f1-score': 0.9574468085106383, 'support': 46.0}, 'accuracy': 0.9565217391304348, 'macro avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.9565011820330971, 'support': 92.0}, 'weighted avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.956501182033097, 'support': 92.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9347826086956522, 'classification_report': {'0': {'precision': 0.9761904761904762, 'recall': 0.8913043478260869, 'f1-score': 0.9318181818181818, 'support': 46.0}, '1': {'precision': 0.9, 'recall': 0.9782608695652174, 'f1-score': 0.9375, 'support': 46.0}, 'accuracy': 
0.9347826086956522, 'macro avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956521, 'f1-score': 0.9346590909090908, 'support': 92.0}, 'weighted avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956522, 'f1-score': 0.9346590909090908, 'support': 92.0}}}\n", + "{'best_params': {'C': 1}, 'avg_loocv_score': 0.9565217391304348, 'classification_report': {'0': {'precision': 0.9772727272727273, 'recall': 0.9347826086956522, 'f1-score': 0.9555555555555557, 'support': 46.0}, '1': {'precision': 0.9375, 'recall': 0.9782608695652174, 'f1-score': 0.9574468085106383, 'support': 46.0}, 'accuracy': 0.9565217391304348, 'macro avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.9565011820330971, 'support': 92.0}, 'weighted avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.956501182033097, 'support': 92.0}}}\n", + "Pipeline complete. Results saved to Result/results_prostate_pca_20241130_210230.csv\n", + "Processing datasets: Dataset/leukemia.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_leukemia.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_leukemia.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 
'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}\n", + "Pipeline complete. 
Results saved to Result/results_leukemia_pca_20241130_210320.csv\n", + "Processing datasets: Dataset/lung.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_lung.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_lung.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9826086956521739, 'classification_report': {'0': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, '1': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, 'accuracy': 0.9826086956521739, 'macro avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}, 'weighted avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9782608695652174, 'classification_report': {'0': {'precision': 0.9824561403508771, 'recall': 0.9739130434782609, 'f1-score': 0.9781659388646288, 'support': 115.0}, '1': {'precision': 0.9741379310344828, 'recall': 0.9826086956521739, 'f1-score': 0.9783549783549784, 'support': 115.0}, 'accuracy': 0.9782608695652174, 'macro avg': {'precision': 0.9782970356926799, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}, 'weighted avg': {'precision': 0.97829703569268, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9826086956521739, 'classification_report': {'0': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, '1': {'precision': 0.9826086956521739, 'recall': 
0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, 'accuracy': 0.9826086956521739, 'macro avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}, 'weighted avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9782608695652174, 'classification_report': {'0': {'precision': 0.9824561403508771, 'recall': 0.9739130434782609, 'f1-score': 0.9781659388646288, 'support': 115.0}, '1': {'precision': 0.9741379310344828, 'recall': 0.9826086956521739, 'f1-score': 0.9783549783549784, 'support': 115.0}, 'accuracy': 0.9782608695652174, 'macro avg': {'precision': 0.9782970356926799, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}, 'weighted avg': {'precision': 0.97829703569268, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}}}\n", + "Pipeline complete. 
Results saved to Result/results_lung_pca_20241130_211200.csv\n", + "Processing datasets: Dataset/colorectal.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_colorectal.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_colorectal.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.990625, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.98125, 'f1-score': 0.9905362776025236, 'support': 160.0}, '1': {'precision': 0.9815950920245399, 'recall': 1.0, 'f1-score': 0.9907120743034055, 'support': 160.0}, 'accuracy': 0.990625, 'macro avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}, 'weighted avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.996875, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.99375, 'f1-score': 0.9968652037617556, 'support': 160.0}, '1': {'precision': 0.9937888198757764, 'recall': 1.0, 'f1-score': 0.9968847352024921, 'support': 160.0}, 'accuracy': 0.996875, 'macro avg': {'precision': 0.9968944099378882, 'recall': 0.996875, 'f1-score': 0.9968749694821238, 'support': 320.0}, 'weighted avg': {'precision': 0.996894409937888, 'recall': 0.996875, 'f1-score': 0.9968749694821237, 'support': 320.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.990625, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.98125, 'f1-score': 0.9905362776025236, 'support': 160.0}, '1': {'precision': 0.9815950920245399, 'recall': 1.0, 'f1-score': 0.9907120743034055, 'support': 160.0}, 'accuracy': 0.990625, 'macro avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}, 'weighted 
avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.996875, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.99375, 'f1-score': 0.9968652037617556, 'support': 160.0}, '1': {'precision': 0.9937888198757764, 'recall': 1.0, 'f1-score': 0.9968847352024921, 'support': 160.0}, 'accuracy': 0.996875, 'macro avg': {'precision': 0.9968944099378882, 'recall': 0.996875, 'f1-score': 0.9968749694821238, 'support': 320.0}, 'weighted avg': {'precision': 0.996894409937888, 'recall': 0.996875, 'f1-score': 0.9968749694821237, 'support': 320.0}}}\n", + "Pipeline complete. Results saved to Result/results_colorectal_pca_20241130_212335.csv\n", + "Processing datasets: Dataset/liver.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_liver.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_liver.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 
'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", + "Pipeline complete. 
Results saved to Result/results_liver_pca_20241130_212555.csv\n", + "Processing datasets: Dataset/renal.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_renal.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_renal.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9620253164556962, 'classification_report': {'0': {'precision': 0.974025974025974, 'recall': 0.9493670886075949, 'f1-score': 0.9615384615384615, 'support': 79.0}, '1': {'precision': 0.9506172839506173, 'recall': 0.9746835443037974, 'f1-score': 0.9625, 'support': 79.0}, 'accuracy': 0.9620253164556962, 'macro avg': {'precision': 0.9623216289882957, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692308, 'support': 158.0}, 'weighted avg': {'precision': 0.9623216289882954, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692309, 'support': 158.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.9367088607594937, 'classification_report': {'0': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, '1': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, 'accuracy': 0.9367088607594937, 'macro avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}, 'weighted avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9620253164556962, 'classification_report': {'0': {'precision': 0.974025974025974, 'recall': 0.9493670886075949, 'f1-score': 0.9615384615384615, 'support': 79.0}, '1': {'precision': 0.9506172839506173, 'recall': 0.9746835443037974, 'f1-score': 
0.9625, 'support': 79.0}, 'accuracy': 0.9620253164556962, 'macro avg': {'precision': 0.9623216289882957, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692308, 'support': 158.0}, 'weighted avg': {'precision': 0.9623216289882954, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692309, 'support': 158.0}}}\n", + "{'best_params': {'C': 1}, 'avg_loocv_score': 0.9367088607594937, 'classification_report': {'0': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, '1': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, 'accuracy': 0.9367088607594937, 'macro avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}, 'weighted avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}}}\n", + "Pipeline complete. Results saved to Result/results_renal_pca_20241130_213009.csv\n", + "Processing datasets: Dataset/bladder.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_bladder.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_bladder.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 
'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}\n", + "Pipeline complete. 
Results saved to Result/results_bladder_pca_20241130_213111.csv\n", + "Processing datasets: Dataset/gastric.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_gastric.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_gastric.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.95, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9, 'f1-score': 0.9473684210526316, 'support': 20.0}, '1': {'precision': 0.9090909090909091, 'recall': 1.0, 'f1-score': 0.9523809523809523, 'support': 20.0}, 'accuracy': 0.95, 'macro avg': {'precision': 0.9545454545454546, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}, 'weighted avg': {'precision': 0.9545454545454545, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.925, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.85, 'f1-score': 0.9189189189189189, 'support': 20.0}, '1': {'precision': 0.8695652173913043, 'recall': 1.0, 'f1-score': 0.9302325581395349, 'support': 20.0}, 'accuracy': 0.925, 'macro avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}, 'weighted avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.95, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9, 'f1-score': 0.9473684210526316, 'support': 20.0}, '1': {'precision': 0.9090909090909091, 'recall': 1.0, 'f1-score': 0.9523809523809523, 'support': 20.0}, 'accuracy': 0.95, 'macro avg': {'precision': 0.9545454545454546, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}, 'weighted avg': {'precision': 0.9545454545454545, 'recall': 0.95, 'f1-score': 
0.949874686716792, 'support': 40.0}}}\n", + "{'best_params': {'C': 1}, 'avg_loocv_score': 0.925, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.85, 'f1-score': 0.9189189189189189, 'support': 20.0}, '1': {'precision': 0.8695652173913043, 'recall': 1.0, 'f1-score': 0.9302325581395349, 'support': 20.0}, 'accuracy': 0.925, 'macro avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}, 'weighted avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}}}\n", + "Pipeline complete. Results saved to Result/results_gastric_pca_20241130_213147.csv\n", + "Processing datasets: Dataset/brain.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_brain.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_brain.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 
0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}\n", + "{'best_params': {'C': 1}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}\n", + "Pipeline complete. 
Results saved to Result/results_brain_pca_20241130_213953.csv\n", + "Processing datasets: Dataset/breast.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_breast.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_breast.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9980694980694981, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9961389961389961, 'f1-score': 0.9980657640232108, 'support': 259.0}, '1': {'precision': 0.9961538461538462, 'recall': 1.0, 'f1-score': 0.9980732177263969, 'support': 259.0}, 'accuracy': 0.9980694980694981, 'macro avg': {'precision': 0.9980769230769231, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748038, 'support': 518.0}, 'weighted avg': {'precision': 0.998076923076923, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748037, 'support': 518.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9980694980694981, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9961389961389961, 'f1-score': 0.9980657640232108, 'support': 259.0}, '1': {'precision': 0.9961538461538462, 'recall': 1.0, 'f1-score': 0.9980732177263969, 'support': 259.0}, 'accuracy': 0.9980694980694981, 'macro avg': {'precision': 0.9980769230769231, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748038, 'support': 518.0}, 'weighted avg': {'precision': 0.998076923076923, 
'recall': 0.9980694980694981, 'f1-score': 0.9980694908748037, 'support': 518.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}}}\n", + "Pipeline complete. Results saved to Result/results_breast_pca_20241130_221107.csv\n", + "Processing datasets: Dataset/pancreatic.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_pancreatic.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_pancreatic.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 
0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}\n", + "Pipeline complete. Results saved to Result/results_pancreatic_pca_20241130_221213.csv\n" + ] + } + ], + "source": [ + "\n", + "dataset_files = [f for f in os.listdir(\"Dataset\") if f.endswith('.csv')]\n", + "for dataset_file in dataset_files:\n", + " file_path = os.path.join(\"Dataset\", dataset_file)\n", + "\n", + " cancer_type = os.path.splitext(file_path)[0].split('/')[-1]\n", + " print(f\"Processing datasets: {file_path}\")\n", + "\n", + " X, y = load_and_combine_datasets(\"ControlDataset/normal.csv\", file_path)\n", + "\n", + " scaler = StandardScaler()\n", + " X_scaled = scaler.fit_transform(X)\n", + " joblib.dump(scaler, f\"Scalers/{cancer_type}_scaler.joblib\")\n", + "\n", + "\n", + " all_results = {}\n", + "\n", + "\n", + " # Train and evaluate models using LOOCV\n", + " model_results = train_and_evaluate_loocv(X_scaled, y, models_to_test, cancer_type)\n", + " all_results.update({model_name: result for model_name, result in model_results.items()})\n", + "\n", + " \n", + " # save total results\n", + " csv_filename = 
f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n", + " save_results_to_csv(results=all_results, output_file=csv_filename)\n", + "\n", + " print(\"Pipeline complete. Results saved to \" + csv_filename)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Baseline/baseline_visualize_logistic_model.ipynb b/Baseline/baseline_visualize_logistic_model.ipynb new file mode 100644 index 0000000..4c90610 --- /dev/null +++ b/Baseline/baseline_visualize_logistic_model.ipynb @@ -0,0 +1,189 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "import pandas as pd\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define paths\n", + "models_folder = \"Models\" # Path to the folder containing model files\n", + "test_data_path = \"TestDataset/test_data.csv\" # Path to test data\n", + "output_folder = \"FeatureWeights\" # Folder to save output CSVs\n", + "os.makedirs(output_folder, exist_ok=True)\n", + "\n", + "feature_names = pd.read_csv(test_data_path, nrows=0).columns.tolist()[2:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing model: Logistic Regression_pancreatic.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_pancreatic_weights.csv\n", + "Processing model: Logistic Regression_brain.joblib\n", + "Saved feature weights to: 
FeatureWeights/Logistic Regression_brain_weights.csv\n", + "Processing model: Logistic Regression_leukemia.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_leukemia_weights.csv\n", + "Processing model: Logistic Regression_gastric.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_gastric_weights.csv\n", + "Processing model: Logistic Regression_colorectal.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_colorectal_weights.csv\n", + "Processing model: Logistic Regression_lung.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_lung_weights.csv\n", + "Processing model: Logistic Regression_breast.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_breast_weights.csv\n", + "Processing model: Logistic Regression_renal.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_renal_weights.csv\n", + "Processing model: Logistic Regression_liver.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_liver_weights.csv\n", + "Processing model: Logistic Regression_bladder.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_bladder_weights.csv\n", + "Processing model: Logistic Regression_throat.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_throat_weights.csv\n", + "Processing model: Logistic Regression_prostate.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_prostate_weights.csv\n", + "Processing complete. 
All feature weights saved.\n" + ] + } + ], + "source": [ + "# Iterate through all joblib files in the Models folder\n", + "for model_file in os.listdir(models_folder):\n", + " if model_file.endswith(\".joblib\"):\n", + " model_path = os.path.join(models_folder, model_file)\n", + " print(f\"Processing model: {model_file}\")\n", + " \n", + " model = joblib.load(model_path)\n", + "\n", + " # Extract coefficients\n", + " coefficients = model.coef_[0]\n", + " \n", + " # Pair feature names with coefficients\n", + " feature_weights = pd.DataFrame({\n", + " \"Feature\": feature_names,\n", + " \"Weight\": coefficients\n", + " })\n", + "\n", + " feature_weights = feature_weights.sort_values(by=\"Weight\", ascending=False)\n", + "\n", + " output_file = os.path.join(output_folder, f\"{os.path.splitext(model_file)[0]}_weights.csv\")\n", + " feature_weights.to_csv(output_file, index=False)\n", + "\n", + " print(f\"Saved feature weights to: {output_file}\")\n", + "\n", + "print(\"Processing complete. All feature weights saved.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def combine_feature_weights(feature_weights_folder, output_file):\n", + " combined_data = [] # To store all rows of the final table\n", + "\n", + " # Iterate through all CSV files in the FeatureWeights folder\n", + " for file_name in os.listdir(feature_weights_folder):\n", + " if file_name.endswith(\"_weights.csv\"):\n", + " cancer_type = file_name.split(\"_weights.csv\")[0]\n", + " \n", + " # Load feature weights\n", + " file_path = os.path.join(feature_weights_folder, file_name)\n", + " df = pd.read_csv(file_path)\n", + " \n", + " # Get top 5 positive and negative weights\n", + " top_positive = df.nlargest(5, \"Weight\").reset_index(drop=True)\n", + " top_negative = df.nsmallest(5, \"Weight\").reset_index(drop=True)\n", + " \n", + " # Prepare a row for this cancer type\n", + " row = {\"Cancer Type\": cancer_type}\n", + " for i in 
range(5): # Add up to 5 features and weights\n", + " row[f\"Positive Feature {i+1}\"] = (\n", + " top_positive.at[i, \"Feature\"] if i < len(top_positive) else \"\"\n", + " )\n", + "\n", + " for i in range(5):\n", + " row[f\"Negative Feature {i+1}\"] = (\n", + " top_negative.at[i, \"Feature\"] if i < len(top_negative) else \"\"\n", + " )\n", + " \n", + " # Add weights in separate columns\n", + " for i in range(5):\n", + " row[f\"Positive Weight {i+1}\"] = (\n", + " top_positive.at[i, \"Weight\"] if i < len(top_positive) else \"\"\n", + " )\n", + " \n", + " for i in range(5):\n", + " row[f\"Negative Weight {i+1}\"] = (\n", + " top_negative.at[i, \"Weight\"] if i < len(top_negative) else \"\"\n", + " )\n", + " combined_data.append(row)\n", + " \n", + " # Save to a CSV file\n", + " combined_df = pd.DataFrame(combined_data)\n", + " combined_df.to_csv(output_file, index=False)\n", + " print(f\"Combined feature weights saved to: {output_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Combined feature weights saved to: CombinedFeatureWeights.csv\n" + ] + } + ], + "source": [ + "# Specify input and output paths\n", + "feature_weights_folder = \"FeatureWeights\"\n", + "output_file = \"CombinedFeatureWeights.csv\" \n", + "\n", + "combine_feature_weights(feature_weights_folder, output_file)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/clustering.ipynb b/Clustering/clustering.ipynb similarity index 100% rename from clustering.ipynb rename to 
Clustering/clustering.ipynb