From c7c87cb878008d7a59414dcbfe3d3fdbf34b12a5 Mon Sep 17 00:00:00 2001
From: wyatt522 <harrisw522@gmail.com>
Date: Mon, 18 Nov 2024 20:23:12 -0500
Subject: [PATCH 1/7] made dataset to model produce csv file

---
 datasets_to_model.ipynb | 70 ++++++++++++++++++++++++++++++-----------
 1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/datasets_to_model.ipynb b/datasets_to_model.ipynb
index def1a8d..1c1668a 100644
--- a/datasets_to_model.ipynb
+++ b/datasets_to_model.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -48,7 +48,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,10 +61,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Train and evaluate models\n",
     "def train_and_evaluate(X_train, X_val, X_test, y_train, y_val, y_test, models):\n",
     "    results = {}\n",
     "    \n",
@@ -73,14 +74,13 @@
     "        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)\n",
     "        grid_search.fit(X_train, y_train)\n",
     "        \n",
-    "        # Save best parameters and performance\n",
     "        best_model = grid_search.best_estimator_\n",
     "        train_score = grid_search.best_score_\n",
     "        val_score = best_model.score(X_val, y_val)\n",
     "        test_score = best_model.score(X_test, y_test)\n",
     "        y_pred = best_model.predict(X_test)\n",
-    "        \n",
-    "        results[model_name] = {\n",
+    "\n",
+    "        results[(model_name, X_train.shape[1])] = {\n",
     "            'best_params': grid_search.best_params_,\n",
     "            'train_score': train_score,\n",
     "            'val_score': val_score,\n",
@@ -93,18 +93,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def save_results(results, output_file):\n",
-    "    with open(output_file, 'w') as f:\n",
-    "        json.dump(results, f, indent=4)"
+    "def save_results_to_csv(results, output_file):\n",
+    "    # Convert the results dictionary into a DataFrame\n",
+    "    print(results)\n",
+    "    rows = []\n",
+    "    for model_name, result in results.items():\n",
+    "        row = {\n",
+    "            'model': model_name[0],\n",
+    "            'pca_size': model_name[1],\n",
+    "            'best_params': result['best_params'],\n",
+    "            'train_score': result['train_score'],\n",
+    "            'val_score': result['val_score'],\n",
+    "            'test_score': result['test_score'],\n",
+    "            'classification_report': str(result['classification_report'])  # Serialize the report as a string\n",
+    "        }\n",
+    "        rows.append(row)\n",
+    "    \n",
+    "    df = pd.DataFrame(rows)\n",
+    "    df.to_csv(output_file, index=False)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -123,12 +138,13 @@
     "    })\n",
     "}\n",
     "\n",
-    "pca_to_test = [0, 10, 100, 160]"
+    "pca_to_test = [0, 10, 100, 160]\n",
+    "all_results = {}"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -162,7 +178,19 @@
       "Training Random Forest...\n",
       "Training SVM...\n",
       "Training Logistic Regression...\n",
-      "Pipeline complete. Results saved to 'results.json'.\n"
+      "{'Random Forest': {}, ('Random Forest', 54675): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9813650128115537, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, 'SVM': {}, ('SVM', 54675): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, 'Logistic Regression': {}, ('Logistic Regression', 54675): {'best_params': {'C': 0.1}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'train_score': 0.9442115071045888, 'val_score': 0.9705882352941176, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9, 'recall': 1.0, 'f1-score': 0.9473684210526316, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 0.8823529411764706, 'f1-score': 0.9375, 'support': 17.0}, 
'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.95, 'recall': 0.9411764705882353, 'f1-score': 0.9424342105263158, 'support': 35.0}, 'weighted avg': {'precision': 0.9485714285714286, 'recall': 0.9428571428571428, 'f1-score': 0.9425751879699249, 'support': 35.0}}}, ('SVM', 10): {'best_params': {'C': 1, 'kernel': 'rbf'}, 'train_score': 0.9437456324248776, 'val_score': 0.9705882352941176, 'test_score': 0.9714285714285714, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9444444444444444, 'f1-score': 0.9714285714285714, 'support': 18.0}, '1': {'precision': 0.9444444444444444, 'recall': 1.0, 'f1-score': 0.9714285714285714, 'support': 17.0}, 'accuracy': 0.9714285714285714, 'macro avg': {'precision': 0.9722222222222222, 'recall': 0.9722222222222222, 'f1-score': 0.9714285714285714, 'support': 35.0}, 'weighted avg': {'precision': 0.9730158730158731, 'recall': 0.9714285714285714, 'f1-score': 0.9714285714285714, 'support': 35.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 1}, 'train_score': 0.9379221989284883, 'val_score': 1.0, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9444444444444444, 'recall': 0.9444444444444444, 'f1-score': 0.9444444444444444, 'support': 18.0}, '1': {'precision': 0.9411764705882353, 'recall': 0.9411764705882353, 'f1-score': 0.9411764705882353, 'support': 17.0}, 'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.9428104575163399, 'recall': 0.9428104575163399, 'f1-score': 0.9428104575163399, 'support': 35.0}, 'weighted avg': {'precision': 0.9428571428571428, 'recall': 0.9428571428571428, 'f1-score': 0.9428571428571428, 'support': 35.0}}}, ('Random Forest', 100): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9443279757745167, 'val_score': 0.9411764705882353, 'test_score': 0.9142857142857143, 'classification_report': {'0': {'precision': 0.8571428571428571, 'recall': 1.0, 'f1-score': 0.923076923076923, 'support': 18.0}, '1': {'precision': 1.0, 
'recall': 0.8235294117647058, 'f1-score': 0.9032258064516129, 'support': 17.0}, 'accuracy': 0.9142857142857143, 'macro avg': {'precision': 0.9285714285714286, 'recall': 0.9117647058823529, 'f1-score': 0.913151364764268, 'support': 35.0}, 'weighted avg': {'precision': 0.926530612244898, 'recall': 0.9142857142857143, 'f1-score': 0.9134349521446294, 'support': 35.0}}}, ('SVM', 100): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9753086419753086, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Logistic Regression', 100): {'best_params': {'C': 0.1}, 'train_score': 0.9876543209876543, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Random Forest', 160): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9318658280922433, 'val_score': 0.9411764705882353, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9, 'recall': 1.0, 'f1-score': 0.9473684210526316, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 0.8823529411764706, 'f1-score': 0.9375, 'support': 17.0}, 'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.95, 'recall': 0.9411764705882353, 'f1-score': 0.9424342105263158, 'support': 35.0}, 'weighted avg': {'precision': 0.9485714285714286, 'recall': 0.9428571428571428, 
'f1-score': 0.9425751879699249, 'support': 35.0}}}, ('SVM', 160): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Logistic Regression', 160): {'best_params': {'C': 0.1}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}}\n"
+     ]
+    },
+    {
+     "ename": "KeyError",
+     "evalue": "'best_params'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[12], line 32\u001b[0m\n\u001b[1;32m     30\u001b[0m \u001b[38;5;66;03m# save total resutls\u001b[39;00m\n\u001b[1;32m     31\u001b[0m csv_filename \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mResult/results_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcancer_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_pca_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdatetime\u001b[38;5;241m.\u001b[39mnow()\u001b[38;5;241m.\u001b[39mstrftime(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mH\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mM\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mS\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 32\u001b[0m \u001b[43msave_results_to_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcsv_filename\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     34\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline complete. Results saved to \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mresults.json\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "Cell \u001b[0;32mIn[11], line 10\u001b[0m, in \u001b[0;36msave_results_to_csv\u001b[0;34m(results, output_file)\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m model_name, result \u001b[38;5;129;01min\u001b[39;00m results\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m      6\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m pca_size, performance \u001b[38;5;129;01min\u001b[39;00m result\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m      7\u001b[0m         row \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m      8\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m'\u001b[39m: model_name,\n\u001b[1;32m      9\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpca_size\u001b[39m\u001b[38;5;124m'\u001b[39m: pca_size,\n\u001b[0;32m---> 10\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbest_params\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[43mperformance\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbest_params\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m,\n\u001b[1;32m     11\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtrain_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtrain_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m     12\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mval_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mval_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m     13\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m     14\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification_report\u001b[39m\u001b[38;5;124m'\u001b[39m: 
\u001b[38;5;28mstr\u001b[39m(performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification_report\u001b[39m\u001b[38;5;124m'\u001b[39m])  \u001b[38;5;66;03m# Serialize the report as a string\u001b[39;00m\n\u001b[1;32m     15\u001b[0m         }\n\u001b[1;32m     16\u001b[0m         rows\u001b[38;5;241m.\u001b[39mappend(row)\n\u001b[1;32m     18\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(rows)\n",
+      "\u001b[0;31mKeyError\u001b[0m: 'best_params'"
      ]
     }
    ],
@@ -178,6 +206,7 @@
     "X_val_scaled = scaler.transform(X_val)\n",
     "X_test_scaled = scaler.transform(X_test)\n",
     "\n",
+    "\n",
     "for num_pca in pca_to_test:\n",
     "    print(f\"running pca on {num_pca} features\")\n",
     "    # Run PCA feature reduction\n",
@@ -191,11 +220,14 @@
     "        X_test_pca = pca.transform(X_test_scaled)\n",
     "\n",
     "    # Train and evaluate models\n",
-    "    results = train_and_evaluate(X_train_pca, X_val_pca, X_test_pca, y_train, y_val, y_test, models_to_test)\n",
+    "    model_results = train_and_evaluate(X_train_pca, X_val_pca, X_test_pca, y_train, y_val, y_test, models_to_test)\n",
     "\n",
-    "    # Save results\n",
-    "    filename = f\"Result/results_{cancer_type}_pca_{num_pca}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json\"\n",
-    "    save_results(results, filename)\n",
+    "    # Update results\n",
+    "    all_results.update(model_results)\n",
+    "    \n",
+    "# save total resutls\n",
+    "csv_filename = f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n",
+    "save_results_to_csv(results=all_results, output_file=csv_filename)\n",
     "\n",
     "print(\"Pipeline complete. Results saved to 'results.json'.\")"
    ]

From a4436b07f1880ea5d61b89b9e0cfb380b54a7f99 Mon Sep 17 00:00:00 2001
From: wyatt522 <harrisw522@gmail.com>
Date: Tue, 19 Nov 2024 16:09:21 -0500
Subject: [PATCH 2/7] added more models

---
 datasets_to_model.ipynb | 122 +++++++++++++++++++++++++++++++---------
 1 file changed, 96 insertions(+), 26 deletions(-)

diff --git a/datasets_to_model.ipynb b/datasets_to_model.ipynb
index 1c1668a..c977fc2 100644
--- a/datasets_to_model.ipynb
+++ b/datasets_to_model.ipynb
@@ -13,6 +13,9 @@
     "from sklearn.ensemble import RandomForestClassifier\n",
     "from sklearn.svm import SVC\n",
     "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "from sklearn.neighbors import KNeighborsClassifier\n",
+    "from sklearn.dummy import DummyClassifier\n",
     "from sklearn.metrics import classification_report\n",
     "from sklearn.preprocessing import StandardScaler\n",
     "import json\n",
@@ -61,7 +64,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -93,7 +96,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -119,15 +122,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define models and parameter grids\n",
     "models_to_test = {\n",
     "    'Random Forest': (RandomForestClassifier(random_state=42), {\n",
-    "        'n_estimators': [50, 100],#, 200],\n",
-    "        'max_depth': [None, 10],#, 20]\n",
+    "        'n_estimators': [50, 100, 200],\n",
+    "        'max_depth': [None, 10, 20]\n",
     "    }),\n",
     "    'SVM': (SVC(random_state=42), {\n",
     "        'C': [0.1, 1, 10],\n",
@@ -135,22 +138,47 @@
     "    }),\n",
     "    'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500), {\n",
     "        'C': [0.1, 1, 10]\n",
-    "    })\n",
+    "    }),\n",
+    "    'Naive Bayes': (GaussianNB(), {}),\n",
+    "    'KNN': (KNeighborsClassifier(), {\n",
+    "        'n_neighbors': [3, 5, 7, 10],\n",
+    "        'weights': ['uniform', 'distance']\n",
+    "    }),\n",
+    "    'ZeroR': (DummyClassifier(strategy='most_frequent'), {}),  # ZeroR always predicts the most frequent class\n",
+    "\n",
     "}\n",
     "\n",
-    "pca_to_test = [0, 10, 100, 160]\n",
+    "pca_to_test = [0, 10, 100]\n",
     "all_results = {}"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: 'Dataset/pancreastic.csv'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# Load Data\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[43mload_and_combine_datasets\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDataset/normal.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDataset/pancreastic.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      3\u001b[0m cancer_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpancreatic\u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
+      "Cell \u001b[0;32mIn[2], line 4\u001b[0m, in \u001b[0;36mload_and_combine_datasets\u001b[0;34m(control_path, cancer_path)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_and_combine_datasets\u001b[39m(control_path, cancer_path):\n\u001b[1;32m      2\u001b[0m     \u001b[38;5;66;03m# Load datasets\u001b[39;00m\n\u001b[1;32m      3\u001b[0m     control_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(control_path)\n\u001b[0;32m----> 4\u001b[0m     cancer_df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcancer_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m      6\u001b[0m     \u001b[38;5;66;03m# Determine the size for undersampling\u001b[39;00m\n\u001b[1;32m      7\u001b[0m     sample_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(cancer_df)\n",
+      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m    899\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m    900\u001b[0m     dialect,\n\u001b[1;32m    901\u001b[0m     delimiter,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    908\u001b[0m     dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m    909\u001b[0m )\n\u001b[1;32m    910\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    574\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m    576\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 577\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    579\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m    580\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
+      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m   1404\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m   1406\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1407\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m   1659\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m   1660\u001b[0m         mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1661\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1662\u001b[0m \u001b[43m    \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1663\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1664\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1665\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1666\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1667\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1668\u001b[0m \u001b[43m    \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1669\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1670\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   
1671\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1672\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
+      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    854\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m    855\u001b[0m     \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m    856\u001b[0m     \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m    857\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m    858\u001b[0m         \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 859\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m    860\u001b[0m \u001b[43m            \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    861\u001b[0m \u001b[43m            \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    862\u001b[0m \u001b[43m            \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    863\u001b[0m \u001b[43m            \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    864\u001b[0m \u001b[43m            
\u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    865\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    866\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    867\u001b[0m         \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m    868\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'Dataset/pancreastic.csv'"
+     ]
+    }
+   ],
    "source": [
     "# Load Data\n",
-    "X, y = load_and_combine_datasets(\"Dataset/normal.csv\", \"Dataset/lung.csv\")\n",
-    "cancer_type = \"lung\""
+    "X, y = load_and_combine_datasets(\"Dataset/normal.csv\", \"Dataset/pancreatic.csv\")\n",
+    "cancer_type = \"pancreatic\""
    ]
   },
   {
@@ -166,31 +194,73 @@
       "Training Random Forest...\n",
       "Training SVM...\n",
       "Training Logistic Regression...\n",
+      "Training Naive Bayes...\n",
+      "Training KNN...\n",
+      "Training ZeroR...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
       "running pca on 10 features\n",
       "Training Random Forest...\n",
       "Training SVM...\n",
       "Training Logistic Regression...\n",
+      "Training Naive Bayes...\n",
+      "Training KNN...\n",
+      "Training ZeroR...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
       "running pca on 100 features\n",
       "Training Random Forest...\n",
       "Training SVM...\n",
       "Training Logistic Regression...\n",
-      "running pca on 160 features\n",
-      "Training Random Forest...\n",
-      "Training SVM...\n",
-      "Training Logistic Regression...\n",
-      "{'Random Forest': {}, ('Random Forest', 54675): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9813650128115537, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, 'SVM': {}, ('SVM', 54675): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, 'Logistic Regression': {}, ('Logistic Regression', 54675): {'best_params': {'C': 0.1}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'train_score': 0.9442115071045888, 'val_score': 0.9705882352941176, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9, 'recall': 1.0, 'f1-score': 0.9473684210526316, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 0.8823529411764706, 'f1-score': 0.9375, 'support': 17.0}, 
'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.95, 'recall': 0.9411764705882353, 'f1-score': 0.9424342105263158, 'support': 35.0}, 'weighted avg': {'precision': 0.9485714285714286, 'recall': 0.9428571428571428, 'f1-score': 0.9425751879699249, 'support': 35.0}}}, ('SVM', 10): {'best_params': {'C': 1, 'kernel': 'rbf'}, 'train_score': 0.9437456324248776, 'val_score': 0.9705882352941176, 'test_score': 0.9714285714285714, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9444444444444444, 'f1-score': 0.9714285714285714, 'support': 18.0}, '1': {'precision': 0.9444444444444444, 'recall': 1.0, 'f1-score': 0.9714285714285714, 'support': 17.0}, 'accuracy': 0.9714285714285714, 'macro avg': {'precision': 0.9722222222222222, 'recall': 0.9722222222222222, 'f1-score': 0.9714285714285714, 'support': 35.0}, 'weighted avg': {'precision': 0.9730158730158731, 'recall': 0.9714285714285714, 'f1-score': 0.9714285714285714, 'support': 35.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 1}, 'train_score': 0.9379221989284883, 'val_score': 1.0, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9444444444444444, 'recall': 0.9444444444444444, 'f1-score': 0.9444444444444444, 'support': 18.0}, '1': {'precision': 0.9411764705882353, 'recall': 0.9411764705882353, 'f1-score': 0.9411764705882353, 'support': 17.0}, 'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.9428104575163399, 'recall': 0.9428104575163399, 'f1-score': 0.9428104575163399, 'support': 35.0}, 'weighted avg': {'precision': 0.9428571428571428, 'recall': 0.9428571428571428, 'f1-score': 0.9428571428571428, 'support': 35.0}}}, ('Random Forest', 100): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9443279757745167, 'val_score': 0.9411764705882353, 'test_score': 0.9142857142857143, 'classification_report': {'0': {'precision': 0.8571428571428571, 'recall': 1.0, 'f1-score': 0.923076923076923, 'support': 18.0}, '1': {'precision': 1.0, 
'recall': 0.8235294117647058, 'f1-score': 0.9032258064516129, 'support': 17.0}, 'accuracy': 0.9142857142857143, 'macro avg': {'precision': 0.9285714285714286, 'recall': 0.9117647058823529, 'f1-score': 0.913151364764268, 'support': 35.0}, 'weighted avg': {'precision': 0.926530612244898, 'recall': 0.9142857142857143, 'f1-score': 0.9134349521446294, 'support': 35.0}}}, ('SVM', 100): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9753086419753086, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Logistic Regression', 100): {'best_params': {'C': 0.1}, 'train_score': 0.9876543209876543, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Random Forest', 160): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9318658280922433, 'val_score': 0.9411764705882353, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9, 'recall': 1.0, 'f1-score': 0.9473684210526316, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 0.8823529411764706, 'f1-score': 0.9375, 'support': 17.0}, 'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.95, 'recall': 0.9411764705882353, 'f1-score': 0.9424342105263158, 'support': 35.0}, 'weighted avg': {'precision': 0.9485714285714286, 'recall': 0.9428571428571428, 
'f1-score': 0.9425751879699249, 'support': 35.0}}}, ('SVM', 160): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Logistic Regression', 160): {'best_params': {'C': 0.1}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}}\n"
+      "Training Naive Bayes...\n",
+      "Training KNN...\n",
+      "Training ZeroR...\n",
+      "{('Random Forest', 54675): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('SVM', 54675): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 54675): {'best_params': {'C': 0.1}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 54675): {'best_params': {}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 54675): 
{'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 54675): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('SVM', 10): {'best_params': {'C': 0.1, 'kernel': 'rbf'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 
'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 0.1}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 10): {'best_params': {}, 'train_score': 0.9797979797979798, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 10): {'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 
0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 10): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}, ('Random Forest', 100): {'best_params': {'max_depth': None, 'n_estimators': 200}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('SVM', 100): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 100): {'best_params': {'C': 0.1}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 
'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 100): {'best_params': {}, 'train_score': 0.9696969696969697, 'val_score': 0.9545454545454546, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 100): {'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 100): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}}\n",
+      "Pipeline complete. Results saved to Result/results_liver_pca_20241119_160536.csv\n"
      ]
     },
     {
-     "ename": "KeyError",
-     "evalue": "'best_params'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[12], line 32\u001b[0m\n\u001b[1;32m     30\u001b[0m \u001b[38;5;66;03m# save total resutls\u001b[39;00m\n\u001b[1;32m     31\u001b[0m csv_filename \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mResult/results_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcancer_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_pca_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdatetime\u001b[38;5;241m.\u001b[39mnow()\u001b[38;5;241m.\u001b[39mstrftime(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mH\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mM\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mS\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 32\u001b[0m \u001b[43msave_results_to_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcsv_filename\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     34\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline complete. Results saved to \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mresults.json\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
-      "Cell \u001b[0;32mIn[11], line 10\u001b[0m, in \u001b[0;36msave_results_to_csv\u001b[0;34m(results, output_file)\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m model_name, result \u001b[38;5;129;01min\u001b[39;00m results\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m      6\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m pca_size, performance \u001b[38;5;129;01min\u001b[39;00m result\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m      7\u001b[0m         row \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m      8\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m'\u001b[39m: model_name,\n\u001b[1;32m      9\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpca_size\u001b[39m\u001b[38;5;124m'\u001b[39m: pca_size,\n\u001b[0;32m---> 10\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbest_params\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[43mperformance\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbest_params\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m,\n\u001b[1;32m     11\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtrain_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtrain_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m     12\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mval_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mval_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m     13\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m     14\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification_report\u001b[39m\u001b[38;5;124m'\u001b[39m: 
\u001b[38;5;28mstr\u001b[39m(performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification_report\u001b[39m\u001b[38;5;124m'\u001b[39m])  \u001b[38;5;66;03m# Serialize the report as a string\u001b[39;00m\n\u001b[1;32m     15\u001b[0m         }\n\u001b[1;32m     16\u001b[0m         rows\u001b[38;5;241m.\u001b[39mappend(row)\n\u001b[1;32m     18\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(rows)\n",
-      "\u001b[0;31mKeyError\u001b[0m: 'best_params'"
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n"
      ]
     }
    ],
@@ -229,7 +299,7 @@
     "csv_filename = f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n",
     "save_results_to_csv(results=all_results, output_file=csv_filename)\n",
     "\n",
-    "print(\"Pipeline complete. Results saved to 'results.json'.\")"
+    "print(\"Pipeline complete. Results saved to \" + csv_filename)"
    ]
   }
  ],

From fca045bfd887b36fff1bbfe4d3fdd72468025d32 Mon Sep 17 00:00:00 2001
From: wyatt522 <harrisw522@gmail.com>
Date: Mon, 25 Nov 2024 14:27:42 -0500
Subject: [PATCH 3/7] Use leave-one-out cross-validation and L1 penalty where applicable

---
 datasets_to_model.ipynb | 244 +++++++++++++++++-----------------------
 1 file changed, 105 insertions(+), 139 deletions(-)

diff --git a/datasets_to_model.ipynb b/datasets_to_model.ipynb
index c977fc2..55c8fd9 100644
--- a/datasets_to_model.ipynb
+++ b/datasets_to_model.ipynb
@@ -18,7 +18,7 @@
     "from sklearn.dummy import DummyClassifier\n",
     "from sklearn.metrics import classification_report\n",
     "from sklearn.preprocessing import StandardScaler\n",
-    "import json\n",
+    "from sklearn.model_selection import LeaveOneOut\n",
     "from datetime import datetime\n"
    ]
   },
@@ -68,29 +68,47 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Train and evaluate models\n",
-    "def train_and_evaluate(X_train, X_val, X_test, y_train, y_val, y_test, models):\n",
+    "# Train and evaluate models using LOOCV\n",
+    "def train_and_evaluate_loocv(X, y, models):\n",
+    "    loo = LeaveOneOut()\n",
     "    results = {}\n",
-    "    \n",
+    "\n",
     "    for model_name, (model, param_grid) in models.items():\n",
-    "        print(f\"Training {model_name}...\")\n",
+    "        print(f\"Training {model_name} with LOOCV...\")\n",
+    "        fold_scores = []\n",
+    "        all_y_true = []\n",
+    "        all_y_pred = []\n",
+    "\n",
+    "        # GridSearch for hyperparameter tuning\n",
     "        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)\n",
-    "        grid_search.fit(X_train, y_train)\n",
-    "        \n",
+    "        grid_search.fit(X, y)\n",
     "        best_model = grid_search.best_estimator_\n",
-    "        train_score = grid_search.best_score_\n",
-    "        val_score = best_model.score(X_val, y_val)\n",
-    "        test_score = best_model.score(X_test, y_test)\n",
-    "        y_pred = best_model.predict(X_test)\n",
     "\n",
-    "        results[(model_name, X_train.shape[1])] = {\n",
+    "        # Perform LOOCV\n",
+    "        for train_index, test_index in loo.split(X):\n",
+    "            X_train, X_test = X[train_index], X[test_index]\n",
+    "            y_train, y_test = y[train_index], y[test_index]\n",
+    "\n",
+    "            # Fit and predict with the best model\n",
+    "            best_model.fit(X_train, y_train)\n",
+    "            y_pred = best_model.predict(X_test)\n",
+    "\n",
+    "            # Track results\n",
+    "            fold_scores.append(best_model.score(X_test, y_test))\n",
+    "            all_y_true.extend(y_test)\n",
+    "            all_y_pred.extend(y_pred)\n",
+    "\n",
+    "        # Compute overall statistics\n",
+    "        avg_score = np.mean(fold_scores)\n",
+    "        classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)\n",
+    "\n",
+    "        # Store results\n",
+    "        results[model_name] = {\n",
     "            'best_params': grid_search.best_params_,\n",
-    "            'train_score': train_score,\n",
-    "            'val_score': val_score,\n",
-    "            'test_score': test_score,\n",
-    "            'classification_report': classification_report(y_test, y_pred, output_dict=True)\n",
+    "            'avg_loocv_score': avg_score,\n",
+    "            'classification_report': classification_report_dict\n",
     "        }\n",
-    "    \n",
+    "\n",
     "    return results"
    ]
   },
@@ -105,13 +123,12 @@
     "    print(results)\n",
     "    rows = []\n",
     "    for model_name, result in results.items():\n",
+    "        print(result)\n",
     "        row = {\n",
     "            'model': model_name[0],\n",
     "            'pca_size': model_name[1],\n",
     "            'best_params': result['best_params'],\n",
-    "            'train_score': result['train_score'],\n",
-    "            'val_score': result['val_score'],\n",
-    "            'test_score': result['test_score'],\n",
+    "            'avg loocv score': result['avg_loocv_score'],\n",
     "            'classification_report': str(result['classification_report'])  # Serialize the report as a string\n",
     "        }\n",
     "        rows.append(row)\n",
@@ -132,11 +149,10 @@
     "        'n_estimators': [50, 100, 200],\n",
     "        'max_depth': [None, 10, 20]\n",
     "    }),\n",
-    "    'SVM': (SVC(random_state=42), {\n",
-    "        'C': [0.1, 1, 10],\n",
-    "        'kernel': ['linear', 'rbf']\n",
+    "    'SVM': (SVC(kernel='linear', random_state=42), {\n",
+    "        'C': [0.1, 1, 10]\n",
     "    }),\n",
-    "    'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500), {\n",
+    "    'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500, penalty='l1', solver='liblinear'), {\n",
     "        'C': [0.1, 1, 10]\n",
     "    }),\n",
     "    'Naive Bayes': (GaussianNB(), {}),\n",
@@ -148,154 +164,104 @@
     "\n",
     "}\n",
     "\n",
-    "pca_to_test = [0, 10, 100]\n",
+    "pca_to_test = [0, 10, 44]\n",
     "all_results = {}"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "CHANGE WHAT DATA IS BEING LOADED HERE"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "FileNotFoundError",
-     "evalue": "[Errno 2] No such file or directory: 'Dataset/pancreastic.csv'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# Load Data\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[43mload_and_combine_datasets\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDataset/normal.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDataset/pancreastic.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      3\u001b[0m cancer_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpancreatic\u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
-      "Cell \u001b[0;32mIn[2], line 4\u001b[0m, in \u001b[0;36mload_and_combine_datasets\u001b[0;34m(control_path, cancer_path)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_and_combine_datasets\u001b[39m(control_path, cancer_path):\n\u001b[1;32m      2\u001b[0m     \u001b[38;5;66;03m# Load datasets\u001b[39;00m\n\u001b[1;32m      3\u001b[0m     control_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(control_path)\n\u001b[0;32m----> 4\u001b[0m     cancer_df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcancer_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m      6\u001b[0m     \u001b[38;5;66;03m# Determine the size for undersampling\u001b[39;00m\n\u001b[1;32m      7\u001b[0m     sample_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(cancer_df)\n",
-      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m    899\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m    900\u001b[0m     dialect,\n\u001b[1;32m    901\u001b[0m     delimiter,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    908\u001b[0m     dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m    909\u001b[0m )\n\u001b[1;32m    910\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    574\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m    576\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 577\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    579\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m    580\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
-      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m   1404\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m   1406\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1407\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m   1659\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m   1660\u001b[0m         mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1661\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1662\u001b[0m \u001b[43m    \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1663\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1664\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1665\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1666\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1667\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1668\u001b[0m \u001b[43m    \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1669\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1670\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   
1671\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1672\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
-      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    854\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m    855\u001b[0m     \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m    856\u001b[0m     \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m    857\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m    858\u001b[0m         \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 859\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m    860\u001b[0m \u001b[43m            \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    861\u001b[0m \u001b[43m            \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    862\u001b[0m \u001b[43m            \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    863\u001b[0m \u001b[43m            \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    864\u001b[0m \u001b[43m            
\u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    865\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    866\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    867\u001b[0m         \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m    868\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
-      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'Dataset/pancreastic.csv'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Load Data\n",
-    "X, y = load_and_combine_datasets(\"Dataset/normal.csv\", \"Dataset/pancreatic.csv\")\n",
-    "cancer_type = \"pancreatic\""
+    "X, y = load_and_combine_datasets(\"Dataset/normal.csv\", \"Dataset/lung.csv\")\n",
+    "cancer_type = \"lung\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "running pca on 0 features\n",
-      "Training Random Forest...\n",
-      "Training SVM...\n",
-      "Training Logistic Regression...\n",
-      "Training Naive Bayes...\n",
-      "Training KNN...\n",
-      "Training ZeroR...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n",
-      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n",
-      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "running pca on 10 features\n",
-      "Training Random Forest...\n",
-      "Training SVM...\n",
-      "Training Logistic Regression...\n",
-      "Training Naive Bayes...\n",
-      "Training KNN...\n",
-      "Training ZeroR...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n",
-      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n",
-      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "running pca on 100 features\n",
-      "Training Random Forest...\n",
-      "Training SVM...\n",
-      "Training Logistic Regression...\n",
-      "Training Naive Bayes...\n",
-      "Training KNN...\n",
-      "Training ZeroR...\n",
-      "{('Random Forest', 54675): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('SVM', 54675): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 54675): {'best_params': {'C': 0.1}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 54675): {'best_params': {}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 54675): 
{'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 54675): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('SVM', 10): {'best_params': {'C': 0.1, 'kernel': 'rbf'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 
'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 0.1}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 10): {'best_params': {}, 'train_score': 0.9797979797979798, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 10): {'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 
0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 10): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}, ('Random Forest', 100): {'best_params': {'max_depth': None, 'n_estimators': 200}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('SVM', 100): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 100): {'best_params': {'C': 0.1}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 
'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 100): {'best_params': {}, 'train_score': 0.9696969696969697, 'val_score': 0.9545454545454546, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 100): {'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 100): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}}\n",
-      "Pipeline complete. Results saved to Result/results_liver_pca_20241119_160536.csv\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n",
-      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n",
-      "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n"
+      "Running LOOCV with PCA on 0 features...\n",
+      "Training Random Forest with LOOCV...\n",
+      "Training SVM with LOOCV...\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Training Naive Bayes with LOOCV...\n",
+      "Training KNN with LOOCV...\n",
+      "Training ZeroR with LOOCV...\n",
+      "Running LOOCV with PCA on 10 features...\n",
+      "Training Random Forest with LOOCV...\n",
+      "Training SVM with LOOCV...\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Training Naive Bayes with LOOCV...\n",
+      "Training KNN with LOOCV...\n",
+      "Training ZeroR with LOOCV...\n",
+      "Running LOOCV with PCA on 44 features...\n",
+      "Training Random Forest with LOOCV...\n",
+      "Training SVM with LOOCV...\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Training Naive Bayes with LOOCV...\n",
+      "Training KNN with LOOCV...\n",
+      "Training ZeroR with LOOCV...\n",
+      "{('Random Forest', 0): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('SVM', 0): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Logistic Regression', 0): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Naive Bayes', 0): {'best_params': {}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 
1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('KNN', 0): {'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 0): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 
'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('SVM', 10): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Naive Bayes', 10): {'best_params': {}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, '1': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 
144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('KNN', 10): {'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 10): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}, ('Random Forest', 44): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('SVM', 44): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 
0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Logistic Regression', 44): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Naive Bayes', 44): {'best_params': {}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('KNN', 44): {'best_params': {'n_neighbors': 5, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 
'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 44): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
+      "{'best_params': {}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
+      "{'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n",
+      "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n",
+      "{'best_params': {}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, '1': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n",
+      "{'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n",
+      "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
+      "{'best_params': {}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n",
+      "{'best_params': {'n_neighbors': 5, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n",
+      "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_liver_pca_20241125_142350.csv\n"
      ]
     }
    ],
    "source": [
     "\n",
-    "# Split data\n",
-    "X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)\n",
-    "X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)\n",
-    "\n",
-    "# Preprocess data\n",
+    "# Preprocess Data\n",
     "scaler = StandardScaler()\n",
-    "X_train_scaled = scaler.fit_transform(X_train)\n",
-    "X_val_scaled = scaler.transform(X_val)\n",
-    "X_test_scaled = scaler.transform(X_test)\n",
+    "X_scaled = scaler.fit_transform(X)\n",
+    "\n",
     "\n",
+    "all_results = {}\n",
     "\n",
     "for num_pca in pca_to_test:\n",
-    "    print(f\"running pca on {num_pca} features\")\n",
-    "    # Run PCA feature reduction\n",
+    "    print(f\"Running LOOCV with PCA on {num_pca} features...\")\n",
+    "    # Apply PCA if specified\n",
     "    if num_pca == 0:\n",
-    "        X_train_pca = X_train_scaled\n",
-    "        X_val_pca = X_val_scaled\n",
-    "        X_test_pca = X_test_scaled\n",
+    "        X_pca = X_scaled\n",
     "    else:\n",
-    "        X_train_pca, pca = pca_data(X_train_scaled, num_pca)\n",
-    "        X_val_pca = pca.transform(X_val_scaled)\n",
-    "        X_test_pca = pca.transform(X_test_scaled)\n",
+    "        X_pca, pca = pca_data(X_scaled, num_pca)\n",
     "\n",
-    "    # Train and evaluate models\n",
-    "    model_results = train_and_evaluate(X_train_pca, X_val_pca, X_test_pca, y_train, y_val, y_test, models_to_test)\n",
+    "    # Train and evaluate models using LOOCV\n",
+    "    model_results = train_and_evaluate_loocv(X_pca, y, models_to_test)\n",
+    "    all_results.update({(model_name, num_pca): result for model_name, result in model_results.items()})\n",
     "\n",
-    "    # Update results\n",
-    "    all_results.update(model_results)\n",
     "    \n",
-    "# save total resutls\n",
+    "# save total results\n",
     "csv_filename = f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n",
     "save_results_to_csv(results=all_results, output_file=csv_filename)\n",
     "\n",

From 1c5ad1c8d35321a05d3818271c48bb85e2a1bc6a Mon Sep 17 00:00:00 2001
From: wyatt522 <harrisw522@gmail.com>
Date: Sat, 30 Nov 2024 23:43:44 -0500
Subject: [PATCH 4/7] reorganized and created aggregate model and inference
 file

---
 .gitignore                |   9 +-
 aggregate_and_infer.ipynb | 201 +++++++++++++++++++++
 datasets_to_model.ipynb   | 293 ------------------------------
 train_and_evaluate.ipynb  | 363 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 572 insertions(+), 294 deletions(-)
 create mode 100644 aggregate_and_infer.ipynb
 delete mode 100644 datasets_to_model.ipynb
 create mode 100644 train_and_evaluate.ipynb

diff --git a/.gitignore b/.gitignore
index 6be7e48..e5dd687 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,10 @@
 Dataset/*
 !Dataset/.gitkeep
-.venv/*
\ No newline at end of file
+.venv/*
+ControlDataset/*
+Models/*
+NotInUseModels/*
+Result/*
+Scalers/*
+TestDataset/*
+inference_results.csv
\ No newline at end of file
diff --git a/aggregate_and_infer.ipynb b/aggregate_and_infer.ipynb
new file mode 100644
index 0000000..dbaf357
--- /dev/null
+++ b/aggregate_and_infer.ipynb
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import joblib\n",
+    "import pandas as pd\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_models(models_folder=\"Models\"):\n",
+    "    \"\"\"Return every '.joblib' model in `models_folder`, keyed by the two '_'-separated filename parts.\"\"\"\n",
+    "    loaded = {}\n",
+    "    joblib_files = [entry for entry in os.listdir(models_folder) if entry.endswith(\".joblib\")]\n",
+    "    for entry in joblib_files:\n",
+    "        # Filenames are expected to be '<model>_<cancer type>.joblib'; the second key part keeps\n",
+    "        # its extension, and a name with zero or several underscores raises ValueError here\n",
+    "        model_name, cancer_type = entry.split(\"_\")\n",
+    "        loaded[(model_name, cancer_type)] = joblib.load(os.path.join(models_folder, entry))\n",
+    "        print(f\"Loaded model: {model_name} for cancer type: {cancer_type}\")\n",
+    "    return loaded"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def preprocess_data(new_data_path, scaler=None):\n",
+    "    \"\"\"Read a CSV and return (standardized features, untouched DataFrame).\n",
+    "\n",
+    "    Without a `scaler`, a new StandardScaler is fitted on this very data;\n",
+    "    otherwise the given scaler is only applied.\n",
+    "    \"\"\"\n",
+    "    raw_frame = pd.read_csv(new_data_path)\n",
+    "\n",
+    "    # Remove the label columns when present (adjust to the dataset's structure)\n",
+    "    label_columns = ['cancer_type', 'type']\n",
+    "    features = raw_frame.drop(label_columns, axis=1, errors='ignore')\n",
+    "\n",
+    "    if scaler is not None:\n",
+    "        return scaler.transform(features), raw_frame\n",
+    "\n",
+    "    return StandardScaler().fit_transform(features), raw_frame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def run_inference(models, test_data_path, scalers_folder=\"Scalers\"):\n",
+    "    \"\"\"Score each test row with every model and pick the most confident cancer call.\n",
+    "\n",
+    "    Returns one dict per row ('index', 'cancer_type', 'confidence', 'predictions');\n",
+    "    'cancer_type' is 'normal' with confidence -1.0 when no model exceeds 0.5.\n",
+    "    \"\"\"\n",
+    "    test_df = pd.read_csv(test_data_path)\n",
+    "    # Label columns may be missing from unlabeled data, so absent ones are ignored\n",
+    "    test_features = test_df.drop(['cancer_type', 'type'], axis=1, errors='ignore')\n",
+    "    scaler_files = [f for f in os.listdir(scalers_folder) if f.endswith('.joblib')]\n",
+    "    scaler_cache = {}  # cancer type -> loaded scaler, so each file is read from disk once\n",
+    "    results = []\n",
+    "\n",
+    "    for position, index in enumerate(test_df.index):\n",
+    "        row_predictions = {}  # Store model predictions and their confidence\n",
+    "        for (model_name, model_file_part), model in models.items():\n",
+    "            # The key's second part still carries the file extension; strip it\n",
+    "            cancer_type = os.path.splitext(model_file_part)[0]\n",
+    "            scaler_filename = f\"{cancer_type}_scaler.joblib\"\n",
+    "            if scaler_filename in scaler_files:\n",
+    "                if cancer_type not in scaler_cache:\n",
+    "                    scaler_cache[cancer_type] = joblib.load(os.path.join(scalers_folder, scaler_filename))\n",
+    "                # Select by position: .iloc is positional, whereas `index` is a label\n",
+    "                test_features_scaled = scaler_cache[cancer_type].transform([test_features.iloc[position]])\n",
+    "                # Run inference with the model\n",
+    "                probabilities = model.predict_proba(test_features_scaled)\n",
+    "                confidence = probabilities[0][1]  # probability of the second class (presumably label 1 = cancer)\n",
+    "                # Store the model's prediction and confidence\n",
+    "                row_predictions[f\"{model_name}-{cancer_type}\"] = {\n",
+    "                    'cancer_type': cancer_type,\n",
+    "                    'predicted_class': model.predict(test_features_scaled)[0],\n",
+    "                    'confidence': confidence,\n",
+    "                    'probabilities': probabilities[0].tolist()\n",
+    "                }\n",
+    "            else:\n",
+    "                print(f\"couldn't find {cancer_type}_scaler.joblib\")\n",
+    "\n",
+    "        # Determine the final prediction: highest confidence among models above 0.5\n",
+    "        final_prediction = \"normal\"\n",
+    "        max_confidence = -1\n",
+    "        for prediction_info in row_predictions.values():\n",
+    "            if prediction_info['confidence'] > max_confidence and prediction_info['confidence'] > 0.5:\n",
+    "                max_confidence = prediction_info['confidence']\n",
+    "                final_prediction = prediction_info['cancer_type']\n",
+    "\n",
+    "        # Append the final prediction for the current data point\n",
+    "        results.append({\n",
+    "            'index': index,\n",
+    "            'cancer_type': final_prediction,\n",
+    "            'confidence': max_confidence if final_prediction != \"normal\" else -1.0,\n",
+    "            'predictions': row_predictions\n",
+    "        })\n",
+    "\n",
+    "    return results\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def save_inference_results(results, output_file=\"inference_results.csv\"):\n",
+    "    \"\"\"Dump `results` to a CSV file (row index omitted) and report where it went.\"\"\"\n",
+    "    # Build the frame and export it in one chained call\n",
+    "    pd.DataFrame(results).to_csv(output_file, index=False)\n",
+    "    print(f\"Inference results saved to {output_file}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# End-to-end inference: load saved models, score the test set, persist predictions\n",
+    "models_folder = \"Models\"\n",
+    "new_data_path = \"TestDataset/test_data.csv\"\n",
+    "\n",
+    "# Every '.joblib' file in the models folder becomes one entry of `models`\n",
+    "models = load_models(models_folder)\n",
+    "\n",
+    "\n",
+    "# Scalers are looked up in the default \"Scalers\" folder\n",
+    "inference_results = run_inference(models=models, test_data_path=new_data_path)\n",
+    "\n",
+    "# Written to the same path the accuracy cell reads back\n",
+    "save_inference_results(inference_results, \"inference_results.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 88.24%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Compare the saved predictions against the labels stored in the test set\n",
+    "test_df = pd.read_csv(\"TestDataset/test_data.csv\")\n",
+    "predictions_df = pd.read_csv(\"inference_results.csv\")\n",
+    "\n",
+    "# Fraction of rows whose predicted cancer_type equals the labelled one\n",
+    "accuracy = (test_df[\"cancer_type\"] == predictions_df[\"cancer_type\"]).mean()\n",
+    "print(f\"Accuracy: {accuracy:.2%}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/datasets_to_model.ipynb b/datasets_to_model.ipynb
deleted file mode 100644
index 55c8fd9..0000000
--- a/datasets_to_model.ipynb
+++ /dev/null
@@ -1,293 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
-    "from sklearn.decomposition import PCA\n",
-    "from sklearn.ensemble import RandomForestClassifier\n",
-    "from sklearn.svm import SVC\n",
-    "from sklearn.linear_model import LogisticRegression\n",
-    "from sklearn.naive_bayes import GaussianNB\n",
-    "from sklearn.neighbors import KNeighborsClassifier\n",
-    "from sklearn.dummy import DummyClassifier\n",
-    "from sklearn.metrics import classification_report\n",
-    "from sklearn.preprocessing import StandardScaler\n",
-    "from sklearn.model_selection import LeaveOneOut\n",
-    "from datetime import datetime\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def load_and_combine_datasets(control_path, cancer_path):\n",
-    "    # Load datasets\n",
-    "    control_df = pd.read_csv(control_path)\n",
-    "    cancer_df = pd.read_csv(cancer_path)\n",
-    "    \n",
-    "    # Determine the size for undersampling\n",
-    "    sample_size = len(cancer_df)\n",
-    "    \n",
-    "    # Undersample the healthy dataset\n",
-    "    control_df_sample = control_df.sample(n=sample_size, random_state=42)\n",
-    "    \n",
-    "    # Combine datasets and shuffle\n",
-    "    combined_df = pd.concat([control_df_sample, cancer_df]).sample(frac=1, random_state=42).reset_index(drop=True)\n",
-    "    \n",
-    "    # Split into features and labels\n",
-    "    X = combined_df.drop(['cancer_type', 'type'], axis=1)\n",
-    "    y = combined_df['cancer_type'].apply(lambda x: 1 if x != 'normal' else 0)  # 1 = cancer, 0 = healthy\n",
-    "    \n",
-    "    return X, y\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def pca_data(X, n_components):\n",
-    "    # Apply PCA (assumes X is already standardized)\n",
-    "    pca = PCA(n_components=n_components)\n",
-    "    X_pca = pca.fit_transform(X)\n",
-    "    return X_pca, pca"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Train and evaluate models using LOOCV\n",
-    "def train_and_evaluate_loocv(X, y, models):\n",
-    "    loo = LeaveOneOut()\n",
-    "    results = {}\n",
-    "\n",
-    "    for model_name, (model, param_grid) in models.items():\n",
-    "        print(f\"Training {model_name} with LOOCV...\")\n",
-    "        fold_scores = []\n",
-    "        all_y_true = []\n",
-    "        all_y_pred = []\n",
-    "\n",
-    "        # GridSearch for hyperparameter tuning\n",
-    "        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)\n",
-    "        grid_search.fit(X, y)\n",
-    "        best_model = grid_search.best_estimator_\n",
-    "\n",
-    "        # Perform LOOCV\n",
-    "        for train_index, test_index in loo.split(X):\n",
-    "            X_train, X_test = X[train_index], X[test_index]\n",
-    "            y_train, y_test = y[train_index], y[test_index]\n",
-    "\n",
-    "            # Fit and predict with the best model\n",
-    "            best_model.fit(X_train, y_train)\n",
-    "            y_pred = best_model.predict(X_test)\n",
-    "\n",
-    "            # Track results\n",
-    "            fold_scores.append(best_model.score(X_test, y_test))\n",
-    "            all_y_true.extend(y_test)\n",
-    "            all_y_pred.extend(y_pred)\n",
-    "\n",
-    "        # Compute overall statistics\n",
-    "        avg_score = np.mean(fold_scores)\n",
-    "        classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)\n",
-    "\n",
-    "        # Store results\n",
-    "        results[model_name] = {\n",
-    "            'best_params': grid_search.best_params_,\n",
-    "            'avg_loocv_score': avg_score,\n",
-    "            'classification_report': classification_report_dict\n",
-    "        }\n",
-    "\n",
-    "    return results"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def save_results_to_csv(results, output_file):\n",
-    "    # Convert the results dictionary into a DataFrame\n",
-    "    print(results)\n",
-    "    rows = []\n",
-    "    for model_name, result in results.items():\n",
-    "        print(result)\n",
-    "        row = {\n",
-    "            'model': model_name[0],\n",
-    "            'pca_size': model_name[1],\n",
-    "            'best_params': result['best_params'],\n",
-    "            'avg loocv score': result['avg_loocv_score'],\n",
-    "            'classification_report': str(result['classification_report'])  # Serialize the report as a string\n",
-    "        }\n",
-    "        rows.append(row)\n",
-    "    \n",
-    "    df = pd.DataFrame(rows)\n",
-    "    df.to_csv(output_file, index=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Define models and parameter grids\n",
-    "models_to_test = {\n",
-    "    'Random Forest': (RandomForestClassifier(random_state=42), {\n",
-    "        'n_estimators': [50, 100, 200],\n",
-    "        'max_depth': [None, 10, 20]\n",
-    "    }),\n",
-    "    'SVM': (SVC(kernel='linear', random_state=42), {\n",
-    "        'C': [0.1, 1, 10]\n",
-    "    }),\n",
-    "    'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500, penalty='l1', solver='liblinear'), {\n",
-    "        'C': [0.1, 1, 10]\n",
-    "    }),\n",
-    "    'Naive Bayes': (GaussianNB(), {}),\n",
-    "    'KNN': (KNeighborsClassifier(), {\n",
-    "        'n_neighbors': [3, 5, 7, 10],\n",
-    "        'weights': ['uniform', 'distance']\n",
-    "    }),\n",
-    "    'ZeroR': (DummyClassifier(strategy='most_frequent'), {}),  # ZeroR always predicts the most frequent class\n",
-    "\n",
-    "}\n",
-    "\n",
-    "pca_to_test = [0, 10, 44]\n",
-    "all_results = {}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "CHANGE WHAT DATA IS BEING LOADED HERE"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Load Data\n",
-    "X, y = load_and_combine_datasets(\"Dataset/normal.csv\", \"Dataset/lung.csv\")\n",
-    "cancer_type = \"lung\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running LOOCV with PCA on 0 features...\n",
-      "Training Random Forest with LOOCV...\n",
-      "Training SVM with LOOCV...\n",
-      "Training Logistic Regression with LOOCV...\n",
-      "Training Naive Bayes with LOOCV...\n",
-      "Training KNN with LOOCV...\n",
-      "Training ZeroR with LOOCV...\n",
-      "Running LOOCV with PCA on 10 features...\n",
-      "Training Random Forest with LOOCV...\n",
-      "Training SVM with LOOCV...\n",
-      "Training Logistic Regression with LOOCV...\n",
-      "Training Naive Bayes with LOOCV...\n",
-      "Training KNN with LOOCV...\n",
-      "Training ZeroR with LOOCV...\n",
-      "Running LOOCV with PCA on 44 features...\n",
-      "Training Random Forest with LOOCV...\n",
-      "Training SVM with LOOCV...\n",
-      "Training Logistic Regression with LOOCV...\n",
-      "Training Naive Bayes with LOOCV...\n",
-      "Training KNN with LOOCV...\n",
-      "Training ZeroR with LOOCV...\n",
-      "{('Random Forest', 0): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('SVM', 0): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Logistic Regression', 0): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Naive Bayes', 0): {'best_params': {}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 
1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('KNN', 0): {'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 0): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 
'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('SVM', 10): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Naive Bayes', 10): {'best_params': {}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, '1': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 
144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('KNN', 10): {'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 10): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}, ('Random Forest', 44): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('SVM', 44): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 
0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Logistic Regression', 44): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Naive Bayes', 44): {'best_params': {}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('KNN', 44): {'best_params': {'n_neighbors': 5, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 
'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 44): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}}\n",
-      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
-      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
-      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
-      "{'best_params': {}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
-      "{'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n",
-      "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n",
-      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n",
-      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n",
-      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n",
-      "{'best_params': {}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, '1': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n",
-      "{'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n",
-      "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n",
-      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n",
-      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n",
-      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
-      "{'best_params': {}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n",
-      "{'best_params': {'n_neighbors': 5, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n",
-      "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n",
-      "Pipeline complete. Results saved to Result/results_liver_pca_20241125_142350.csv\n"
-     ]
-    }
-   ],
-   "source": [
-    "\n",
-    "# Preprocess Data\n",
-    "scaler = StandardScaler()\n",
-    "X_scaled = scaler.fit_transform(X)\n",
-    "\n",
-    "\n",
-    "all_results = {}\n",
-    "\n",
-    "for num_pca in pca_to_test:\n",
-    "    print(f\"Running LOOCV with PCA on {num_pca} features...\")\n",
-    "    # Apply PCA if specified\n",
-    "    if num_pca == 0:\n",
-    "        X_pca = X_scaled\n",
-    "    else:\n",
-    "        X_pca, pca = pca_data(X_scaled, num_pca)\n",
-    "\n",
-    "    # Train and evaluate models using LOOCV\n",
-    "    model_results = train_and_evaluate_loocv(X_pca, y, models_to_test)\n",
-    "    all_results.update({(model_name, num_pca): result for model_name, result in model_results.items()})\n",
-    "\n",
-    "    \n",
-    "# save total results\n",
-    "csv_filename = f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n",
-    "save_results_to_csv(results=all_results, output_file=csv_filename)\n",
-    "\n",
-    "print(\"Pipeline complete. Results saved to \" + csv_filename)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/train_and_evaluate.ipynb b/train_and_evaluate.ipynb
new file mode 100644
index 0000000..2582c97
--- /dev/null
+++ b/train_and_evaluate.ipynb
@@ -0,0 +1,363 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
+    "from sklearn.decomposition import PCA\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "from sklearn.neighbors import KNeighborsClassifier\n",
+    "from sklearn.dummy import DummyClassifier\n",
+    "from sklearn.metrics import classification_report\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.model_selection import LeaveOneOut\n",
+    "from datetime import datetime\n",
+    "import os\n",
+    "import joblib\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_and_combine_datasets(control_path, cancer_path):\n",
+    "    \"\"\"Load the control and cancer CSVs and return shuffled features X and labels y.\n",
+    "\n",
+    "    Controls are undersampled to the cancer row count when enough are available.\n",
+    "    X drops the 'cancer_type' and 'type' columns; y is 1 for cancer, 0 for 'normal'.\n",
+    "    \"\"\"\n",
+    "    control_df = pd.read_csv(control_path)\n",
+    "    cancer_df = pd.read_csv(cancer_path)\n",
+    "\n",
+    "    # Seeded undersampling so repeated runs pick the same controls; capped so it cannot over-draw\n",
+    "    sample_size = min(len(cancer_df), len(control_df))\n",
+    "    control_df_sample = control_df.sample(n=sample_size, random_state=42)\n",
+    "\n",
+    "    # Combine datasets and shuffle\n",
+    "    combined_df = pd.concat([control_df_sample, cancer_df]).sample(frac=1, random_state=42).reset_index(drop=True)\n",
+    "\n",
+    "    X = combined_df.drop(['cancer_type', 'type'], axis=1)\n",
+    "    y = combined_df['cancer_type'].apply(lambda x: 1 if x != 'normal' else 0)  # 1 = cancer, 0 = healthy\n",
+    "    return X, y\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pca_data(X, n_components):\n",
+    "    \"\"\"Fit PCA on X (assumed already standardized); return (transformed X, fitted PCA).\"\"\"\n",
+    "    reducer = PCA(n_components=n_components)\n",
+    "    reduced = reducer.fit_transform(X)\n",
+    "    return reduced, reducer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def save_model(model, model_name, cancer_type, folder=\"Models\"):\n",
+    "    \"\"\"Save the model to <folder>/<model_name>_<cancer_type>.joblib using joblib.\"\"\"\n",
+    "    # exist_ok=True creates the folder when missing without a separate check-then-create step\n",
+    "    os.makedirs(folder, exist_ok=True)\n",
+    "    model_filename = f\"{folder}/{model_name}_{cancer_type}.joblib\"\n",
+    "    joblib.dump(model, model_filename)\n",
+    "    print(f\"Model {model_name} saved to {model_filename}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train and evaluate models using LOOCV\n",
+    "def train_and_evaluate_loocv(X, y, models, cancer_type):\n",
+    "    \"\"\"Grid-search each model on (X, y), save it, then score it with leave-one-out CV.\n",
+    "\n",
+    "    models maps name -> (estimator, param_grid). Returns name -> dict with\n",
+    "    'best_params', 'avg_loocv_score' and 'classification_report'.\n",
+    "    \"\"\"\n",
+    "    loo = LeaveOneOut()\n",
+    "    # loo.split yields positions, so index plain arrays rather than pandas labels\n",
+    "    X_arr, y_arr = np.asarray(X), np.asarray(y)\n",
+    "    results = {}\n",
+    "\n",
+    "    for model_name, (model, param_grid) in models.items():\n",
+    "        print(f\"Training {model_name} with LOOCV...\")\n",
+    "        fold_scores = []\n",
+    "        all_y_true, all_y_pred = [], []\n",
+    "\n",
+    "        # NOTE(review): tuning uses every sample, including each later held-out one\n",
+    "        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)\n",
+    "        grid_search.fit(X, y)\n",
+    "        best_model = grid_search.best_estimator_\n",
+    "        # Save the best model now; the LOOCV loop below refits this same object\n",
+    "        save_model(best_model, model_name, cancer_type)\n",
+    "\n",
+    "        # Perform LOOCV\n",
+    "        for train_index, test_index in loo.split(X_arr):\n",
+    "            X_train, X_test = X_arr[train_index], X_arr[test_index]\n",
+    "            y_train, y_test = y_arr[train_index], y_arr[test_index]\n",
+    "            best_model.fit(X_train, y_train)\n",
+    "            y_pred = best_model.predict(X_test)\n",
+    "            fold_scores.append(best_model.score(X_test, y_test))\n",
+    "            all_y_true.extend(y_test)\n",
+    "            all_y_pred.extend(y_pred)\n",
+    "\n",
+    "        avg_score = np.mean(fold_scores)\n",
+    "        classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)\n",
+    "\n",
+    "        results[model_name] = {\n",
+    "            'best_params': grid_search.best_params_,\n",
+    "            'avg_loocv_score': avg_score,\n",
+    "            'classification_report': classification_report_dict\n",
+    "        }\n",
+    "\n",
+    "    return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def save_results_to_csv(results, output_file):\n",
+    "    \"\"\"Write one CSV row per model: name, best params, LOOCV score and the stringified report.\"\"\"\n",
+    "    columns = ['model', 'best_params', 'avg loocv score', 'classification_report']\n",
+    "    print(results)\n",
+    "    rows = []\n",
+    "    for model_name, result in results.items():\n",
+    "        print(result)\n",
+    "        rows.append({\n",
+    "            'model': model_name,\n",
+    "            'best_params': result['best_params'],\n",
+    "            'avg loocv score': result['avg_loocv_score'],\n",
+    "            'classification_report': str(result['classification_report'])  # Serialize the report as a string\n",
+    "        })\n",
+    "\n",
+    "    # Explicit columns keep the header row even when `results` is empty\n",
+    "    pd.DataFrame(rows, columns=columns).to_csv(output_file, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Candidate classifiers: display name -> (unfitted estimator, GridSearchCV parameter grid).\n",
+    "# Commented-out entries are alternatives that are currently disabled.\n",
+    "models_to_test = {\n",
+    "    'Random Forest': (\n",
+    "        RandomForestClassifier(random_state=42),\n",
+    "        {'n_estimators': [50, 100], 'max_depth': [None, 10]},\n",
+    "    ),\n",
+    "    # 'SVM': (SVC(kernel='linear', random_state=42), {'C': [0.1, 1, 10]}),\n",
+    "    'Logistic Regression': (\n",
+    "        LogisticRegression(random_state=42, max_iter=500, penalty='l1', solver='liblinear'),\n",
+    "        {'C': [0.1, 1]},\n",
+    "    ),\n",
+    "    # 'Naive Bayes': (GaussianNB(), {}),\n",
+    "    # 'KNN': (\n",
+    "    #     KNeighborsClassifier(),\n",
+    "    #     {'n_neighbors': [3, 5, 7, 10], 'weights': ['uniform', 'distance']},\n",
+    "    # ),\n",
+    "    # 'ZeroR': (DummyClassifier(strategy='most_frequent'), {}),  # ZeroR always predicts the most frequent class\n",
+    "}\n",
+    "\n",
+    "# Starts empty; presumably filled by the pipeline cell that follows - TODO confirm\n",
+    "all_results = {}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing datasets: Dataset/throat.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_throat.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_throat.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_throat_pca_20241130_210102.csv\n",
+      "Processing datasets: Dataset/prostate.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_prostate.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_prostate.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9347826086956522, 'classification_report': {'0': {'precision': 0.9761904761904762, 'recall': 0.8913043478260869, 'f1-score': 0.9318181818181818, 'support': 46.0}, '1': {'precision': 0.9, 'recall': 0.9782608695652174, 'f1-score': 0.9375, 'support': 46.0}, 'accuracy': 0.9347826086956522, 'macro avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956521, 'f1-score': 0.9346590909090908, 'support': 92.0}, 'weighted avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956522, 'f1-score': 0.9346590909090908, 'support': 92.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.9565217391304348, 'classification_report': {'0': {'precision': 0.9772727272727273, 'recall': 0.9347826086956522, 'f1-score': 0.9555555555555557, 'support': 46.0}, '1': {'precision': 0.9375, 'recall': 0.9782608695652174, 'f1-score': 0.9574468085106383, 'support': 46.0}, 'accuracy': 0.9565217391304348, 'macro avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.9565011820330971, 'support': 92.0}, 'weighted avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.956501182033097, 'support': 92.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9347826086956522, 'classification_report': {'0': {'precision': 0.9761904761904762, 'recall': 0.8913043478260869, 'f1-score': 0.9318181818181818, 'support': 46.0}, '1': {'precision': 0.9, 'recall': 0.9782608695652174, 'f1-score': 0.9375, 'support': 46.0}, 'accuracy': 0.9347826086956522, 'macro avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956521, 'f1-score': 0.9346590909090908, 'support': 92.0}, 'weighted avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956522, 'f1-score': 0.9346590909090908, 'support': 92.0}}}\n",
+      "{'best_params': {'C': 1}, 'avg_loocv_score': 0.9565217391304348, 'classification_report': {'0': {'precision': 0.9772727272727273, 'recall': 0.9347826086956522, 'f1-score': 0.9555555555555557, 'support': 46.0}, '1': {'precision': 0.9375, 'recall': 0.9782608695652174, 'f1-score': 0.9574468085106383, 'support': 46.0}, 'accuracy': 0.9565217391304348, 'macro avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.9565011820330971, 'support': 92.0}, 'weighted avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.956501182033097, 'support': 92.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_prostate_pca_20241130_210230.csv\n",
+      "Processing datasets: Dataset/leukemia.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_leukemia.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_leukemia.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_leukemia_pca_20241130_210320.csv\n",
+      "Processing datasets: Dataset/lung.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_lung.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_lung.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9826086956521739, 'classification_report': {'0': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, '1': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, 'accuracy': 0.9826086956521739, 'macro avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}, 'weighted avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9782608695652174, 'classification_report': {'0': {'precision': 0.9824561403508771, 'recall': 0.9739130434782609, 'f1-score': 0.9781659388646288, 'support': 115.0}, '1': {'precision': 0.9741379310344828, 'recall': 0.9826086956521739, 'f1-score': 0.9783549783549784, 'support': 115.0}, 'accuracy': 0.9782608695652174, 'macro avg': {'precision': 0.9782970356926799, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}, 'weighted avg': {'precision': 0.97829703569268, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9826086956521739, 'classification_report': {'0': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, '1': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, 'accuracy': 0.9826086956521739, 'macro avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}, 'weighted avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9782608695652174, 'classification_report': {'0': {'precision': 0.9824561403508771, 'recall': 0.9739130434782609, 'f1-score': 0.9781659388646288, 'support': 115.0}, '1': {'precision': 0.9741379310344828, 'recall': 0.9826086956521739, 'f1-score': 0.9783549783549784, 'support': 115.0}, 'accuracy': 0.9782608695652174, 'macro avg': {'precision': 0.9782970356926799, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}, 'weighted avg': {'precision': 0.97829703569268, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_lung_pca_20241130_211200.csv\n",
+      "Processing datasets: Dataset/colorectal.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_colorectal.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_colorectal.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.990625, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.98125, 'f1-score': 0.9905362776025236, 'support': 160.0}, '1': {'precision': 0.9815950920245399, 'recall': 1.0, 'f1-score': 0.9907120743034055, 'support': 160.0}, 'accuracy': 0.990625, 'macro avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}, 'weighted avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.996875, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.99375, 'f1-score': 0.9968652037617556, 'support': 160.0}, '1': {'precision': 0.9937888198757764, 'recall': 1.0, 'f1-score': 0.9968847352024921, 'support': 160.0}, 'accuracy': 0.996875, 'macro avg': {'precision': 0.9968944099378882, 'recall': 0.996875, 'f1-score': 0.9968749694821238, 'support': 320.0}, 'weighted avg': {'precision': 0.996894409937888, 'recall': 0.996875, 'f1-score': 0.9968749694821237, 'support': 320.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.990625, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.98125, 'f1-score': 0.9905362776025236, 'support': 160.0}, '1': {'precision': 0.9815950920245399, 'recall': 1.0, 'f1-score': 0.9907120743034055, 'support': 160.0}, 'accuracy': 0.990625, 'macro avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}, 'weighted avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.996875, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.99375, 'f1-score': 0.9968652037617556, 'support': 160.0}, '1': {'precision': 0.9937888198757764, 'recall': 1.0, 'f1-score': 0.9968847352024921, 'support': 160.0}, 'accuracy': 0.996875, 'macro avg': {'precision': 0.9968944099378882, 'recall': 0.996875, 'f1-score': 0.9968749694821238, 'support': 320.0}, 'weighted avg': {'precision': 0.996894409937888, 'recall': 0.996875, 'f1-score': 0.9968749694821237, 'support': 320.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_colorectal_pca_20241130_212335.csv\n",
+      "Processing datasets: Dataset/liver.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_liver.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_liver.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_liver_pca_20241130_212555.csv\n",
+      "Processing datasets: Dataset/renal.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_renal.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_renal.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9620253164556962, 'classification_report': {'0': {'precision': 0.974025974025974, 'recall': 0.9493670886075949, 'f1-score': 0.9615384615384615, 'support': 79.0}, '1': {'precision': 0.9506172839506173, 'recall': 0.9746835443037974, 'f1-score': 0.9625, 'support': 79.0}, 'accuracy': 0.9620253164556962, 'macro avg': {'precision': 0.9623216289882957, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692308, 'support': 158.0}, 'weighted avg': {'precision': 0.9623216289882954, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692309, 'support': 158.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.9367088607594937, 'classification_report': {'0': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, '1': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, 'accuracy': 0.9367088607594937, 'macro avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}, 'weighted avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9620253164556962, 'classification_report': {'0': {'precision': 0.974025974025974, 'recall': 0.9493670886075949, 'f1-score': 0.9615384615384615, 'support': 79.0}, '1': {'precision': 0.9506172839506173, 'recall': 0.9746835443037974, 'f1-score': 0.9625, 'support': 79.0}, 'accuracy': 0.9620253164556962, 'macro avg': {'precision': 0.9623216289882957, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692308, 'support': 158.0}, 'weighted avg': {'precision': 0.9623216289882954, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692309, 'support': 158.0}}}\n",
+      "{'best_params': {'C': 1}, 'avg_loocv_score': 0.9367088607594937, 'classification_report': {'0': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, '1': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, 'accuracy': 0.9367088607594937, 'macro avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}, 'weighted avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_renal_pca_20241130_213009.csv\n",
+      "Processing datasets: Dataset/bladder.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_bladder.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_bladder.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_bladder_pca_20241130_213111.csv\n",
+      "Processing datasets: Dataset/gastric.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_gastric.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_gastric.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.95, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9, 'f1-score': 0.9473684210526316, 'support': 20.0}, '1': {'precision': 0.9090909090909091, 'recall': 1.0, 'f1-score': 0.9523809523809523, 'support': 20.0}, 'accuracy': 0.95, 'macro avg': {'precision': 0.9545454545454546, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}, 'weighted avg': {'precision': 0.9545454545454545, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.925, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.85, 'f1-score': 0.9189189189189189, 'support': 20.0}, '1': {'precision': 0.8695652173913043, 'recall': 1.0, 'f1-score': 0.9302325581395349, 'support': 20.0}, 'accuracy': 0.925, 'macro avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}, 'weighted avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.95, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9, 'f1-score': 0.9473684210526316, 'support': 20.0}, '1': {'precision': 0.9090909090909091, 'recall': 1.0, 'f1-score': 0.9523809523809523, 'support': 20.0}, 'accuracy': 0.95, 'macro avg': {'precision': 0.9545454545454546, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}, 'weighted avg': {'precision': 0.9545454545454545, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}}}\n",
+      "{'best_params': {'C': 1}, 'avg_loocv_score': 0.925, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.85, 'f1-score': 0.9189189189189189, 'support': 20.0}, '1': {'precision': 0.8695652173913043, 'recall': 1.0, 'f1-score': 0.9302325581395349, 'support': 20.0}, 'accuracy': 0.925, 'macro avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}, 'weighted avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_gastric_pca_20241130_213147.csv\n",
+      "Processing datasets: Dataset/brain.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_brain.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_brain.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}\n",
+      "{'best_params': {'C': 1}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_brain_pca_20241130_213953.csv\n",
+      "Processing datasets: Dataset/breast.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_breast.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_breast.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9980694980694981, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9961389961389961, 'f1-score': 0.9980657640232108, 'support': 259.0}, '1': {'precision': 0.9961538461538462, 'recall': 1.0, 'f1-score': 0.9980732177263969, 'support': 259.0}, 'accuracy': 0.9980694980694981, 'macro avg': {'precision': 0.9980769230769231, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748038, 'support': 518.0}, 'weighted avg': {'precision': 0.998076923076923, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748037, 'support': 518.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9980694980694981, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9961389961389961, 'f1-score': 0.9980657640232108, 'support': 259.0}, '1': {'precision': 0.9961538461538462, 'recall': 1.0, 'f1-score': 0.9980732177263969, 'support': 259.0}, 'accuracy': 0.9980694980694981, 'macro avg': {'precision': 0.9980769230769231, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748038, 'support': 518.0}, 'weighted avg': {'precision': 0.998076923076923, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748037, 'support': 518.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_breast_pca_20241130_221107.csv\n",
+      "Processing datasets: Dataset/pancreatic.csv\n",
+      "Training Random Forest with LOOCV...\n",
+      "Model Random Forest saved to Models/Random Forest_pancreatic.joblib\n",
+      "Training Logistic Regression with LOOCV...\n",
+      "Model Logistic Regression saved to Models/Logistic Regression_pancreatic.joblib\n",
+      "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}}\n",
+      "{'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}\n",
+      "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}\n",
+      "Pipeline complete. Results saved to Result/results_pancreatic_pca_20241130_221213.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# For each CSV in Dataset/: combine it with ControlDataset/normal.csv, scale, train with LOOCV and save results\n",
+    "dataset_files = [f for f in os.listdir(\"Dataset\") if f.endswith('.csv')]\n",
+    "for dataset_file in dataset_files:\n",
+    "    file_path = os.path.join(\"Dataset\", dataset_file)\n",
+    "\n",
+    "    cancer_type = os.path.splitext(dataset_file)[0]  # bare file name, independent of the OS path separator\n",
+    "    print(f\"Processing datasets: {file_path}\")\n",
+    "\n",
+    "    X, y = load_and_combine_datasets(\"ControlDataset/normal.csv\", file_path)\n",
+    "\n",
+    "    scaler = StandardScaler()\n",
+    "    X_scaled = scaler.fit_transform(X)\n",
+    "    joblib.dump(scaler, f\"Scalers/{cancer_type}_scaler.joblib\")\n",
+    "\n",
+    "\n",
+    "    all_results = {}\n",
+    "\n",
+    "\n",
+    "    # Train and evaluate models using LOOCV\n",
+    "    model_results = train_and_evaluate_loocv(X_scaled, y, models_to_test, cancer_type)\n",
+    "    all_results.update(model_results)\n",
+    "\n",
+    "        \n",
+    "    # Save this dataset's results to a timestamped CSV\n",
+    "    csv_filename = f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n",
+    "    save_results_to_csv(results=all_results, output_file=csv_filename)\n",
+    "\n",
+    "    print(\"Pipeline complete. Results saved to \" + csv_filename)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 99c09eeb25a7c0cb31fdce96a12021728e279a87 Mon Sep 17 00:00:00 2001
From: wyatt522 <harrisw522@gmail.com>
Date: Sun, 1 Dec 2024 11:36:54 -0500
Subject: [PATCH 5/7] added way to visualize most impactful logistic weights

---
 .gitignore                     |   4 +-
 visualize_logistic_model.ipynb | 196 +++++++++++++++++++++++++++++++++
 2 files changed, 199 insertions(+), 1 deletion(-)
 create mode 100644 visualize_logistic_model.ipynb

diff --git a/.gitignore b/.gitignore
index e5dd687..b0f7530 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,6 @@ NotInUseModels/*
 Result/*
 Scalers/*
 TestDataset/*
-inference_results.csv
\ No newline at end of file
+inference_results.csv
+FeatureWeights/*
+CombinedFeatureWeights.csv
\ No newline at end of file
diff --git a/visualize_logistic_model.ipynb b/visualize_logistic_model.ipynb
new file mode 100644
index 0000000..0a4acff
--- /dev/null
+++ b/visualize_logistic_model.ipynb
@@ -0,0 +1,196 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import joblib\n",
+    "import pandas as pd\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define paths\n",
+    "models_folder = \"Models\"  # Path to the folder containing model files\n",
+    "test_data_path = \"TestDataset/test_data.csv\"  # Path to test data\n",
+    "output_folder = \"FeatureWeights\"  # Folder to save output CSVs\n",
+    "os.makedirs(output_folder, exist_ok=True)  # Create output folder if it doesn't exist\n",
+    "\n",
+    "# Load feature names from the test data\n",
+    "feature_names = pd.read_csv(test_data_path, nrows=0).columns.tolist()[2:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing model: Logistic Regression_pancreatic.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_pancreatic_weights.csv\n",
+      "Processing model: Logistic Regression_brain.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_brain_weights.csv\n",
+      "Processing model: Logistic Regression_leukemia.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_leukemia_weights.csv\n",
+      "Processing model: Logistic Regression_gastric.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_gastric_weights.csv\n",
+      "Processing model: Logistic Regression_colorectal.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_colorectal_weights.csv\n",
+      "Processing model: Logistic Regression_lung.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_lung_weights.csv\n",
+      "Processing model: Logistic Regression_breast.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_breast_weights.csv\n",
+      "Processing model: Logistic Regression_renal.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_renal_weights.csv\n",
+      "Processing model: Logistic Regression_liver.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_liver_weights.csv\n",
+      "Processing model: Logistic Regression_bladder.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_bladder_weights.csv\n",
+      "Processing model: Logistic Regression_throat.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_throat_weights.csv\n",
+      "Processing model: Logistic Regression_prostate.joblib\n",
+      "Saved feature weights to: FeatureWeights/Logistic Regression_prostate_weights.csv\n",
+      "Processing complete. All feature weights saved.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Export coefficients of each saved Logistic Regression model; Random Forest files in Models/ have no coef_\n",
+    "for model_file in os.listdir(models_folder):\n",
+    "    if model_file.startswith(\"Logistic Regression\") and model_file.endswith(\".joblib\"):\n",
+    "        model_path = os.path.join(models_folder, model_file)\n",
+    "        print(f\"Processing model: {model_file}\")\n",
+    "        \n",
+    "        # Load the logistic regression model\n",
+    "        model = joblib.load(model_path)\n",
+    "\n",
+    "        # Extract coefficients\n",
+    "        coefficients = model.coef_[0]  # Assuming binary classification; modify for multi-class\n",
+    "        \n",
+    "        # Pair feature names with coefficients\n",
+    "        feature_weights = pd.DataFrame({\n",
+    "            \"Feature\": feature_names,\n",
+    "            \"Weight\": coefficients\n",
+    "        })\n",
+    "\n",
+    "        # Sort by weight\n",
+    "        feature_weights = feature_weights.sort_values(by=\"Weight\", ascending=False)\n",
+    "\n",
+    "        # Save to CSV\n",
+    "        output_file = os.path.join(output_folder, f\"{os.path.splitext(model_file)[0]}_weights.csv\")\n",
+    "        feature_weights.to_csv(output_file, index=False)\n",
+    "\n",
+    "        print(f\"Saved feature weights to: {output_file}\")\n",
+    "\n",
+    "print(\"Processing complete. All feature weights saved.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def combine_feature_weights(feature_weights_folder, output_file):\n",
+    "    combined_data = []  # To store all rows of the final table\n",
+    "\n",
+    "    # Iterate through all CSV files in the FeatureWeights folder\n",
+    "    for file_name in os.listdir(feature_weights_folder):\n",
+    "        if file_name.endswith(\"_weights.csv\"):\n",
+    "            cancer_type = file_name.split(\"_weights.csv\")[0]  # Extract cancer type from the file name\n",
+    "            \n",
+    "            # Load feature weights\n",
+    "            file_path = os.path.join(feature_weights_folder, file_name)\n",
+    "            df = pd.read_csv(file_path)\n",
+    "            \n",
+    "            # Get top 5 positive and negative weights\n",
+    "            top_positive = df.nlargest(5, \"Weight\").reset_index(drop=True)\n",
+    "            top_negative = df.nsmallest(5, \"Weight\").reset_index(drop=True)\n",
+    "            \n",
+    "            # Prepare a row for this cancer type\n",
+    "            row = {\"Cancer Type\": cancer_type}\n",
+    "            for i in range(5):  # Add up to 5 features and weights\n",
+    "                row[f\"Positive Feature {i+1}\"] = (\n",
+    "                    top_positive.at[i, \"Feature\"] if i < len(top_positive) else \"\"\n",
+    "                )\n",
+    "\n",
+    "            for i in range(5):\n",
+    "                row[f\"Negative Feature {i+1}\"] = (\n",
+    "                    top_negative.at[i, \"Feature\"] if i < len(top_negative) else \"\"\n",
+    "                )\n",
+    "            \n",
+    "            # Add weights in separate columns\n",
+    "            for i in range(5):\n",
+    "                row[f\"Positive Weight {i+1}\"] = (\n",
+    "                    top_positive.at[i, \"Weight\"] if i < len(top_positive) else \"\"\n",
+    "                )\n",
+    "                \n",
+    "            for i in range(5):\n",
+    "                row[f\"Negative Weight {i+1}\"] = (\n",
+    "                    top_negative.at[i, \"Weight\"] if i < len(top_negative) else \"\"\n",
+    "                )\n",
+    "            combined_data.append(row)\n",
+    "    \n",
+    "    # Create a DataFrame for the combined data\n",
+    "    combined_df = pd.DataFrame(combined_data)\n",
+    "    \n",
+    "    # Save to a CSV file\n",
+    "    combined_df.to_csv(output_file, index=False)\n",
+    "    print(f\"Combined feature weights saved to: {output_file}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Combined feature weights saved to: CombinedFeatureWeights.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Specify input and output paths\n",
+    "feature_weights_folder = \"FeatureWeights\"  # Folder containing individual feature weights CSVs\n",
+    "output_file = \"CombinedFeatureWeights.csv\"  # Output file for the combined results\n",
+    "\n",
+    "# Call the function\n",
+    "combine_feature_weights(feature_weights_folder, output_file)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From a09b8c85ab9a27ffc3f371558a900947d5411116 Mon Sep 17 00:00:00 2001
From: wyatt522 <harrisw522@gmail.com>
Date: Mon, 2 Dec 2024 18:44:03 -0500
Subject: [PATCH 6/7] reorganized baseline pipeline

---
 ...pynb => baseline_aggregate_and_infer.ipynb |  7 ++---
 ...ipynb => baseline_train_and_evaluate.ipynb | 26 +++++--------------
 ...=> baseline_visualize_logistic_model.ipynb | 25 +++++++-----------
 3 files changed, 17 insertions(+), 41 deletions(-)
 rename aggregate_and_infer.ipynb => baseline_aggregate_and_infer.ipynb (96%)
 rename train_and_evaluate.ipynb => baseline_train_and_evaluate.ipynb (98%)
 rename visualize_logistic_model.ipynb => baseline_visualize_logistic_model.ipynb (89%)

diff --git a/aggregate_and_infer.ipynb b/baseline_aggregate_and_infer.ipynb
similarity index 96%
rename from aggregate_and_infer.ipynb
rename to baseline_aggregate_and_infer.ipynb
index dbaf357..04ce609 100644
--- a/aggregate_and_infer.ipynb
+++ b/baseline_aggregate_and_infer.ipynb
@@ -24,7 +24,6 @@
     "    models = {}\n",
     "    for file_name in os.listdir(models_folder):\n",
     "        if file_name.endswith(\".joblib\"):\n",
-    "            # Extract model name and cancer type from the filename\n",
     "            model_name, cancer_type = file_name.split(\"_\")\n",
     "            model_path = os.path.join(models_folder, file_name)\n",
     "            models[(model_name, cancer_type)] = joblib.load(model_path)\n",
@@ -42,11 +41,9 @@
     "    \"\"\"Load and preprocess new data.\"\"\"\n",
     "    # Load the data\n",
     "    data = pd.read_csv(new_data_path)\n",
-    "    \n",
-    "    # Drop unnecessary columns (adjust this based on your dataset structure)\n",
     "    X = data.drop(['cancer_type', 'type'], axis=1, errors='ignore')\n",
     "    \n",
-    "    # Standardize the data\n",
+    "    # standardize the data\n",
     "    if scaler is None:\n",
     "        scaler = StandardScaler()\n",
     "        X_scaled = scaler.fit_transform(X)\n",
@@ -65,7 +62,7 @@
     "def run_inference(models, test_data_path, scalers_folder=\"Scalers\"):\n",
     "    # Load the test data\n",
     "    test_df = pd.read_csv(test_data_path)\n",
-    "    test_features = test_df.drop(['cancer_type', 'type'], axis=1)  # Drop label columns if they exist\n",
+    "    test_features = test_df.drop(['cancer_type', 'type'], axis=1)\n",
     "\n",
     "    # List all scaler files in the Scalers folder\n",
     "    scaler_files = [f for f in os.listdir(scalers_folder) if f.endswith('.joblib')]\n",
diff --git a/train_and_evaluate.ipynb b/baseline_train_and_evaluate.ipynb
similarity index 98%
rename from train_and_evaluate.ipynb
rename to baseline_train_and_evaluate.ipynb
index 2582c97..d9d3440 100644
--- a/train_and_evaluate.ipynb
+++ b/baseline_train_and_evaluate.ipynb
@@ -55,27 +55,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def pca_data(X, n_components):\n",
-    "    # Apply PCA (assumes X is already standardized)\n",
-    "    pca = PCA(n_components=n_components)\n",
-    "    X_pca = pca.fit_transform(X)\n",
-    "    return X_pca, pca"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "def save_model(model, model_name, cancer_type, folder=\"Models\"):\n",
     "    \"\"\"Save the model to a file using joblib.\"\"\"\n",
     "    if not os.path.exists(folder):\n",
-    "        os.makedirs(folder)  # Create directory if it doesn't exist\n",
+    "        os.makedirs(folder)\n",
     "    model_filename = f\"{folder}/{model_name}_{cancer_type}.joblib\"\n",
     "    joblib.dump(model, model_filename)\n",
     "    print(f\"Model {model_name} saved to {model_filename}\")"
@@ -83,7 +70,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -105,12 +92,11 @@
     "        # Save the best model\n",
     "        save_model(best_model, model_name, cancer_type)\n",
     "\n",
-    "        # Perform LOOCV\n",
+    "        # perform LOOCV\n",
     "        for train_index, test_index in loo.split(X):\n",
     "            X_train, X_test = X[train_index], X[test_index]\n",
     "            y_train, y_test = y[train_index], y[test_index]\n",
     "\n",
-    "            # Fit and predict with the best model\n",
     "            best_model.fit(X_train, y_train)\n",
     "            y_pred = best_model.predict(X_test)\n",
     "\n",
@@ -119,7 +105,7 @@
     "            all_y_true.extend(y_test)\n",
     "            all_y_pred.extend(y_pred)\n",
     "\n",
-    "        # Compute overall statistics\n",
+    "        # overall stats\n",
     "        avg_score = np.mean(fold_scores)\n",
     "        classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)\n",
     "\n",
@@ -159,7 +145,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/visualize_logistic_model.ipynb b/baseline_visualize_logistic_model.ipynb
similarity index 89%
rename from visualize_logistic_model.ipynb
rename to baseline_visualize_logistic_model.ipynb
index 0a4acff..4c90610 100644
--- a/visualize_logistic_model.ipynb
+++ b/baseline_visualize_logistic_model.ipynb
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -21,15 +21,14 @@
     "models_folder = \"Models\"  # Path to the folder containing model files\n",
     "test_data_path = \"TestDataset/test_data.csv\"  # Path to test data\n",
     "output_folder = \"FeatureWeights\"  # Folder to save output CSVs\n",
-    "os.makedirs(output_folder, exist_ok=True)  # Create output folder if it doesn't exist\n",
+    "os.makedirs(output_folder, exist_ok=True)\n",
     "\n",
-    "# Load feature names from the test data\n",
     "feature_names = pd.read_csv(test_data_path, nrows=0).columns.tolist()[2:]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -71,11 +70,10 @@
     "        model_path = os.path.join(models_folder, model_file)\n",
     "        print(f\"Processing model: {model_file}\")\n",
     "        \n",
-    "        # Load the logistic regression model\n",
     "        model = joblib.load(model_path)\n",
     "\n",
     "        # Extract coefficients\n",
-    "        coefficients = model.coef_[0]  # Assuming binary classification; modify for multi-class\n",
+    "        coefficients = model.coef_[0]\n",
     "        \n",
     "        # Pair feature names with coefficients\n",
     "        feature_weights = pd.DataFrame({\n",
@@ -83,10 +81,8 @@
     "            \"Weight\": coefficients\n",
     "        })\n",
     "\n",
-    "        # Sort by weight\n",
     "        feature_weights = feature_weights.sort_values(by=\"Weight\", ascending=False)\n",
     "\n",
-    "        # Save to CSV\n",
     "        output_file = os.path.join(output_folder, f\"{os.path.splitext(model_file)[0]}_weights.csv\")\n",
     "        feature_weights.to_csv(output_file, index=False)\n",
     "\n",
@@ -107,7 +103,7 @@
     "    # Iterate through all CSV files in the FeatureWeights folder\n",
     "    for file_name in os.listdir(feature_weights_folder):\n",
     "        if file_name.endswith(\"_weights.csv\"):\n",
-    "            cancer_type = file_name.split(\"_weights.csv\")[0]  # Extract cancer type from the file name\n",
+    "            cancer_type = file_name.split(\"_weights.csv\")[0]\n",
     "            \n",
     "            # Load feature weights\n",
     "            file_path = os.path.join(feature_weights_folder, file_name)\n",
@@ -141,17 +137,15 @@
     "                )\n",
     "            combined_data.append(row)\n",
     "    \n",
-    "    # Create a DataFrame for the combined data\n",
-    "    combined_df = pd.DataFrame(combined_data)\n",
-    "    \n",
     "    # Save to a CSV file\n",
+    "    combined_df = pd.DataFrame(combined_data)\n",
     "    combined_df.to_csv(output_file, index=False)\n",
     "    print(f\"Combined feature weights saved to: {output_file}\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -164,10 +158,9 @@
    ],
    "source": [
     "# Specify input and output paths\n",
-    "feature_weights_folder = \"FeatureWeights\"  # Folder containing individual feature weights CSVs\n",
-    "output_file = \"CombinedFeatureWeights.csv\"  # Output file for the combined results\n",
+    "feature_weights_folder = \"FeatureWeights\"\n",
+    "output_file = \"CombinedFeatureWeights.csv\" \n",
     "\n",
-    "# Call the function\n",
     "combine_feature_weights(feature_weights_folder, output_file)"
    ]
   }

From 9390fb75cf839061038a52ae4a6cf6494f4be9df Mon Sep 17 00:00:00 2001
From: wyatt522 <harrisw522@gmail.com>
Date: Mon, 2 Dec 2024 18:49:22 -0500
Subject: [PATCH 7/7] reorganized files

---
 .../baseline_aggregate_and_infer.ipynb                            | 0
 .../baseline_train_and_evaluate.ipynb                             | 0
 .../baseline_visualize_logistic_model.ipynb                       | 0
 clustering.ipynb => Clustering/clustering.ipynb                   | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename baseline_aggregate_and_infer.ipynb => Baseline/baseline_aggregate_and_infer.ipynb (100%)
 rename baseline_train_and_evaluate.ipynb => Baseline/baseline_train_and_evaluate.ipynb (100%)
 rename baseline_visualize_logistic_model.ipynb => Baseline/baseline_visualize_logistic_model.ipynb (100%)
 rename clustering.ipynb => Clustering/clustering.ipynb (100%)

diff --git a/baseline_aggregate_and_infer.ipynb b/Baseline/baseline_aggregate_and_infer.ipynb
similarity index 100%
rename from baseline_aggregate_and_infer.ipynb
rename to Baseline/baseline_aggregate_and_infer.ipynb
diff --git a/baseline_train_and_evaluate.ipynb b/Baseline/baseline_train_and_evaluate.ipynb
similarity index 100%
rename from baseline_train_and_evaluate.ipynb
rename to Baseline/baseline_train_and_evaluate.ipynb
diff --git a/baseline_visualize_logistic_model.ipynb b/Baseline/baseline_visualize_logistic_model.ipynb
similarity index 100%
rename from baseline_visualize_logistic_model.ipynb
rename to Baseline/baseline_visualize_logistic_model.ipynb
diff --git a/clustering.ipynb b/Clustering/clustering.ipynb
similarity index 100%
rename from clustering.ipynb
rename to Clustering/clustering.ipynb