From c7c87cb878008d7a59414dcbfe3d3fdbf34b12a5 Mon Sep 17 00:00:00 2001 From: wyatt522 Date: Mon, 18 Nov 2024 20:23:12 -0500 Subject: [PATCH 1/7] made dataset to model produce csv file --- datasets_to_model.ipynb | 70 ++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/datasets_to_model.ipynb b/datasets_to_model.ipynb index def1a8d..1c1668a 100644 --- a/datasets_to_model.ipynb +++ b/datasets_to_model.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -61,10 +61,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "# Train and evaluate models\n", "def train_and_evaluate(X_train, X_val, X_test, y_train, y_val, y_test, models):\n", " results = {}\n", " \n", @@ -73,14 +74,13 @@ " grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)\n", " grid_search.fit(X_train, y_train)\n", " \n", - " # Save best parameters and performance\n", " best_model = grid_search.best_estimator_\n", " train_score = grid_search.best_score_\n", " val_score = best_model.score(X_val, y_val)\n", " test_score = best_model.score(X_test, y_test)\n", " y_pred = best_model.predict(X_test)\n", - " \n", - " results[model_name] = {\n", + "\n", + " results[(model_name, X_train.shape[1])] = {\n", " 'best_params': grid_search.best_params_,\n", " 'train_score': train_score,\n", " 'val_score': val_score,\n", @@ -93,18 +93,33 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "def save_results(results, output_file):\n", - " with open(output_file, 'w') as f:\n", - " json.dump(results, f, indent=4)" + "def save_results_to_csv(results, output_file):\n", + " # Convert the results dictionary into a DataFrame\n", + " print(results)\n", + " rows = []\n", + " for model_name, result in results.items():\n", + " row = {\n", + " 'model': model_name[0],\n", + " 'pca_size': model_name[1],\n", + " 'best_params': result['best_params'],\n", + " 'train_score': result['train_score'],\n", + " 'val_score': result['val_score'],\n", + " 'test_score': result['test_score'],\n", + " 'classification_report': str(result['classification_report']) # Serialize the report as a string\n", + " }\n", + " rows.append(row)\n", + " \n", + " df = pd.DataFrame(rows)\n", + " df.to_csv(output_file, index=False)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -123,12 +138,13 @@ " })\n", "}\n", "\n", - "pca_to_test = [0, 10, 100, 160]" + "pca_to_test = [0, 10, 100, 160]\n", + "all_results = {}" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -162,7 +178,19 @@ "Training Random Forest...\n", "Training SVM...\n", "Training Logistic Regression...\n", - "Pipeline complete. Results saved to 'results.json'.\n" + "{'Random Forest': {}, ('Random Forest', 54675): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9813650128115537, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, 'SVM': {}, ('SVM', 54675): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, 'Logistic Regression': {}, ('Logistic Regression', 54675): {'best_params': {'C': 0.1}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'train_score': 0.9442115071045888, 'val_score': 0.9705882352941176, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9, 'recall': 1.0, 'f1-score': 0.9473684210526316, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 0.8823529411764706, 'f1-score': 0.9375, 'support': 17.0}, 'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.95, 'recall': 0.9411764705882353, 'f1-score': 0.9424342105263158, 'support': 35.0}, 'weighted avg': {'precision': 0.9485714285714286, 'recall': 0.9428571428571428, 'f1-score': 0.9425751879699249, 'support': 35.0}}}, ('SVM', 10): {'best_params': {'C': 1, 'kernel': 'rbf'}, 'train_score': 0.9437456324248776, 'val_score': 0.9705882352941176, 'test_score': 0.9714285714285714, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9444444444444444, 'f1-score': 0.9714285714285714, 'support': 18.0}, '1': {'precision': 0.9444444444444444, 'recall': 1.0, 'f1-score': 0.9714285714285714, 'support': 17.0}, 'accuracy': 0.9714285714285714, 'macro avg': {'precision': 0.9722222222222222, 'recall': 0.9722222222222222, 'f1-score': 0.9714285714285714, 'support': 35.0}, 'weighted avg': {'precision': 0.9730158730158731, 'recall': 0.9714285714285714, 'f1-score': 0.9714285714285714, 'support': 35.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 1}, 'train_score': 0.9379221989284883, 'val_score': 1.0, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9444444444444444, 'recall': 0.9444444444444444, 'f1-score': 0.9444444444444444, 'support': 18.0}, '1': {'precision': 0.9411764705882353, 'recall': 0.9411764705882353, 'f1-score': 0.9411764705882353, 'support': 17.0}, 'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.9428104575163399, 'recall': 0.9428104575163399, 'f1-score': 0.9428104575163399, 'support': 35.0}, 'weighted avg': {'precision': 0.9428571428571428, 'recall': 0.9428571428571428, 'f1-score': 0.9428571428571428, 'support': 35.0}}}, ('Random Forest', 100): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9443279757745167, 'val_score': 0.9411764705882353, 'test_score': 0.9142857142857143, 'classification_report': {'0': {'precision': 0.8571428571428571, 'recall': 1.0, 'f1-score': 0.923076923076923, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 0.8235294117647058, 'f1-score': 0.9032258064516129, 'support': 17.0}, 'accuracy': 0.9142857142857143, 'macro avg': {'precision': 0.9285714285714286, 'recall': 0.9117647058823529, 'f1-score': 0.913151364764268, 'support': 35.0}, 'weighted avg': {'precision': 0.926530612244898, 'recall': 0.9142857142857143, 'f1-score': 0.9134349521446294, 'support': 35.0}}}, ('SVM', 100): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9753086419753086, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Logistic Regression', 100): {'best_params': {'C': 0.1}, 'train_score': 0.9876543209876543, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Random Forest', 160): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9318658280922433, 'val_score': 0.9411764705882353, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9, 'recall': 1.0, 'f1-score': 0.9473684210526316, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 0.8823529411764706, 'f1-score': 0.9375, 'support': 17.0}, 'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.95, 'recall': 0.9411764705882353, 'f1-score': 0.9424342105263158, 'support': 35.0}, 'weighted avg': {'precision': 0.9485714285714286, 'recall': 0.9428571428571428, 'f1-score': 0.9425751879699249, 'support': 35.0}}}, ('SVM', 160): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Logistic Regression', 160): {'best_params': {'C': 0.1}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}}\n" + ] + }, + { + "ename": "KeyError", + "evalue": "'best_params'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[12], line 32\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# save total resutls\u001b[39;00m\n\u001b[1;32m 31\u001b[0m csv_filename \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mResult/results_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcancer_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_pca_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdatetime\u001b[38;5;241m.\u001b[39mnow()\u001b[38;5;241m.\u001b[39mstrftime(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mH\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mM\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mS\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 32\u001b[0m \u001b[43msave_results_to_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcsv_filename\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline complete. Results saved to \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mresults.json\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[0;32mIn[11], line 10\u001b[0m, in \u001b[0;36msave_results_to_csv\u001b[0;34m(results, output_file)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m model_name, result \u001b[38;5;129;01min\u001b[39;00m results\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m pca_size, performance \u001b[38;5;129;01min\u001b[39;00m result\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 7\u001b[0m row \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m'\u001b[39m: model_name,\n\u001b[1;32m 9\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpca_size\u001b[39m\u001b[38;5;124m'\u001b[39m: pca_size,\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbest_params\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[43mperformance\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbest_params\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m,\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtrain_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtrain_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mval_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mval_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification_report\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;28mstr\u001b[39m(performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification_report\u001b[39m\u001b[38;5;124m'\u001b[39m]) \u001b[38;5;66;03m# Serialize the report as a string\u001b[39;00m\n\u001b[1;32m 15\u001b[0m }\n\u001b[1;32m 16\u001b[0m rows\u001b[38;5;241m.\u001b[39mappend(row)\n\u001b[1;32m 18\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(rows)\n", + "\u001b[0;31mKeyError\u001b[0m: 'best_params'" ] } ], @@ -178,6 +206,7 @@ "X_val_scaled = scaler.transform(X_val)\n", "X_test_scaled = scaler.transform(X_test)\n", "\n", + "\n", "for num_pca in pca_to_test:\n", " print(f\"running pca on {num_pca} features\")\n", " # Run PCA feature reduction\n", @@ -191,11 +220,14 @@ " X_test_pca = pca.transform(X_test_scaled)\n", "\n", " # Train and evaluate models\n", - " results = train_and_evaluate(X_train_pca, X_val_pca, X_test_pca, y_train, y_val, y_test, models_to_test)\n", + " model_results = train_and_evaluate(X_train_pca, X_val_pca, X_test_pca, y_train, y_val, y_test, models_to_test)\n", "\n", - " # Save results\n", - " filename = f\"Result/results_{cancer_type}_pca_{num_pca}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json\"\n", - " save_results(results, filename)\n", + " # Update results\n", + " all_results.update(model_results)\n", + " \n", + "# save total resutls\n", + "csv_filename = f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n", + "save_results_to_csv(results=all_results, output_file=csv_filename)\n", "\n", "print(\"Pipeline complete. Results saved to 'results.json'.\")" ] From a4436b07f1880ea5d61b89b9e0cfb380b54a7f99 Mon Sep 17 00:00:00 2001 From: wyatt522 Date: Tue, 19 Nov 2024 16:09:21 -0500 Subject: [PATCH 2/7] added more models --- datasets_to_model.ipynb | 122 +++++++++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 26 deletions(-) diff --git a/datasets_to_model.ipynb b/datasets_to_model.ipynb index 1c1668a..c977fc2 100644 --- a/datasets_to_model.ipynb +++ b/datasets_to_model.ipynb @@ -13,6 +13,9 @@ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.svm import SVC\n", "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.dummy import DummyClassifier\n", "from sklearn.metrics import classification_report\n", "from sklearn.preprocessing import StandardScaler\n", "import json\n", @@ -61,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -93,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -119,15 +122,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Define models and parameter grids\n", "models_to_test = {\n", " 'Random Forest': (RandomForestClassifier(random_state=42), {\n", - " 'n_estimators': [50, 100],#, 200],\n", - " 'max_depth': [None, 10],#, 20]\n", + " 'n_estimators': [50, 100, 200],\n", + " 'max_depth': [None, 10, 20]\n", " }),\n", " 'SVM': (SVC(random_state=42), {\n", " 'C': [0.1, 1, 10],\n", @@ -135,22 +138,47 @@ " }),\n", " 'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500), {\n", " 'C': [0.1, 1, 10]\n", - " })\n", + " }),\n", + " 'Naive Bayes': (GaussianNB(), {}),\n", + " 'KNN': (KNeighborsClassifier(), {\n", + " 'n_neighbors': [3, 5, 7, 10],\n", + " 'weights': ['uniform', 'distance']\n", + " }),\n", + " 'ZeroR': (DummyClassifier(strategy='most_frequent'), {}), # ZeroR always predicts the most frequent class\n", + "\n", "}\n", "\n", - "pca_to_test = [0, 10, 100, 160]\n", + "pca_to_test = [0, 10, 100]\n", "all_results = {}" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'Dataset/pancreastic.csv'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Load Data\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[43mload_and_combine_datasets\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDataset/normal.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDataset/pancreastic.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m cancer_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpancreatic\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "Cell \u001b[0;32mIn[2], line 4\u001b[0m, in \u001b[0;36mload_and_combine_datasets\u001b[0;34m(control_path, cancer_path)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_and_combine_datasets\u001b[39m(control_path, cancer_path):\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# Load datasets\u001b[39;00m\n\u001b[1;32m 3\u001b[0m control_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(control_path)\n\u001b[0;32m----> 4\u001b[0m cancer_df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcancer_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Determine the size for undersampling\u001b[39;00m\n\u001b[1;32m 7\u001b[0m sample_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(cancer_df)\n", + "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 899\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 900\u001b[0m dialect,\n\u001b[1;32m 901\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 908\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 909\u001b[0m )\n\u001b[1;32m 910\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 574\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 576\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 577\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 579\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 580\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1406\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1407\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1660\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1661\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1662\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1663\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1664\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1668\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1669\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1670\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1671\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1672\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 855\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 856\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 857\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 858\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 859\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 860\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 861\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 862\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 863\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 864\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 865\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 867\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 868\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'Dataset/pancreastic.csv'" + ] + } + ], "source": [ "# Load Data\n", - "X, y = load_and_combine_datasets(\"Dataset/normal.csv\", \"Dataset/lung.csv\")\n", - "cancer_type = \"lung\"" + "X, y = load_and_combine_datasets(\"Dataset/normal.csv\", \"Dataset/pancreatic.csv\")\n", + "cancer_type = \"pancreatic\"" ] }, { @@ -166,31 +194,73 @@ "Training Random Forest...\n", "Training SVM...\n", "Training Logistic Regression...\n", + "Training Naive Bayes...\n", + "Training KNN...\n", + "Training ZeroR...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "running pca on 10 features\n", "Training Random Forest...\n", "Training SVM...\n", "Training Logistic Regression...\n", + "Training Naive Bayes...\n", + "Training KNN...\n", + "Training ZeroR...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "running pca on 100 features\n", "Training Random Forest...\n", "Training SVM...\n", "Training Logistic Regression...\n", - "running pca on 160 features\n", - "Training Random Forest...\n", - "Training SVM...\n", - "Training Logistic Regression...\n", - "{'Random Forest': {}, ('Random Forest', 54675): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9813650128115537, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, 'SVM': {}, ('SVM', 54675): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, 'Logistic Regression': {}, ('Logistic Regression', 54675): {'best_params': {'C': 0.1}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'train_score': 0.9442115071045888, 'val_score': 0.9705882352941176, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9, 'recall': 1.0, 'f1-score': 0.9473684210526316, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 0.8823529411764706, 'f1-score': 0.9375, 'support': 17.0}, 'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.95, 'recall': 0.9411764705882353, 'f1-score': 0.9424342105263158, 'support': 35.0}, 'weighted avg': {'precision': 0.9485714285714286, 'recall': 0.9428571428571428, 'f1-score': 0.9425751879699249, 'support': 35.0}}}, ('SVM', 10): {'best_params': {'C': 1, 'kernel': 'rbf'}, 'train_score': 0.9437456324248776, 'val_score': 0.9705882352941176, 'test_score': 0.9714285714285714, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9444444444444444, 'f1-score': 0.9714285714285714, 'support': 18.0}, '1': {'precision': 0.9444444444444444, 'recall': 1.0, 'f1-score': 0.9714285714285714, 'support': 17.0}, 'accuracy': 0.9714285714285714, 'macro avg': {'precision': 0.9722222222222222, 'recall': 0.9722222222222222, 'f1-score': 0.9714285714285714, 'support': 35.0}, 'weighted avg': {'precision': 0.9730158730158731, 'recall': 0.9714285714285714, 'f1-score': 0.9714285714285714, 'support': 35.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 1}, 'train_score': 0.9379221989284883, 'val_score': 1.0, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9444444444444444, 'recall': 0.9444444444444444, 'f1-score': 0.9444444444444444, 'support': 18.0}, '1': {'precision': 0.9411764705882353, 'recall': 0.9411764705882353, 'f1-score': 0.9411764705882353, 'support': 17.0}, 'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.9428104575163399, 'recall': 0.9428104575163399, 'f1-score': 0.9428104575163399, 'support': 35.0}, 'weighted avg': {'precision': 0.9428571428571428, 'recall': 0.9428571428571428, 'f1-score': 0.9428571428571428, 'support': 35.0}}}, ('Random Forest', 100): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9443279757745167, 'val_score': 0.9411764705882353, 'test_score': 0.9142857142857143, 'classification_report': {'0': {'precision': 0.8571428571428571, 'recall': 1.0, 'f1-score': 0.923076923076923, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 0.8235294117647058, 'f1-score': 0.9032258064516129, 'support': 17.0}, 'accuracy': 0.9142857142857143, 'macro avg': {'precision': 0.9285714285714286, 'recall': 0.9117647058823529, 'f1-score': 0.913151364764268, 'support': 35.0}, 'weighted avg': {'precision': 0.926530612244898, 'recall': 0.9142857142857143, 'f1-score': 0.9134349521446294, 'support': 35.0}}}, ('SVM', 100): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9753086419753086, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Logistic Regression', 100): {'best_params': {'C': 0.1}, 'train_score': 0.9876543209876543, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Random Forest', 160): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.9318658280922433, 'val_score': 0.9411764705882353, 'test_score': 0.9428571428571428, 'classification_report': {'0': {'precision': 0.9, 'recall': 1.0, 'f1-score': 0.9473684210526316, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 0.8823529411764706, 'f1-score': 0.9375, 'support': 17.0}, 'accuracy': 0.9428571428571428, 'macro avg': {'precision': 0.95, 'recall': 0.9411764705882353, 'f1-score': 0.9424342105263158, 'support': 35.0}, 'weighted avg': {'precision': 0.9485714285714286, 'recall': 0.9428571428571428, 'f1-score': 0.9425751879699249, 'support': 35.0}}}, ('SVM', 160): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}, ('Logistic Regression', 160): {'best_params': {'C': 0.1}, 'train_score': 0.9938271604938271, 'val_score': 0.9705882352941176, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 18.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 17.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35.0}}}}\n" + "Training Naive Bayes...\n", + "Training KNN...\n", + "Training ZeroR...\n", + "{('Random Forest', 54675): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('SVM', 54675): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 54675): {'best_params': {'C': 0.1}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 54675): {'best_params': {}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 54675): {'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 54675): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('SVM', 10): {'best_params': {'C': 0.1, 'kernel': 'rbf'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 0.1}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 10): {'best_params': {}, 'train_score': 0.9797979797979798, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 10): {'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 10): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}, ('Random Forest', 100): {'best_params': {'max_depth': None, 'n_estimators': 200}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('SVM', 100): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 100): {'best_params': {'C': 0.1}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 100): {'best_params': {}, 'train_score': 0.9696969696969697, 'val_score': 0.9545454545454546, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 100): {'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 100): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}}\n", + "Pipeline complete. Results saved to Result/results_liver_pca_20241119_160536.csv\n" ] }, { - "ename": "KeyError", - "evalue": "'best_params'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[12], line 32\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# save total resutls\u001b[39;00m\n\u001b[1;32m 31\u001b[0m csv_filename \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mResult/results_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcancer_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_pca_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdatetime\u001b[38;5;241m.\u001b[39mnow()\u001b[38;5;241m.\u001b[39mstrftime(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mH\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mM\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mS\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 32\u001b[0m \u001b[43msave_results_to_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcsv_filename\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline complete. Results saved to \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mresults.json\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "Cell \u001b[0;32mIn[11], line 10\u001b[0m, in \u001b[0;36msave_results_to_csv\u001b[0;34m(results, output_file)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m model_name, result \u001b[38;5;129;01min\u001b[39;00m results\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m pca_size, performance \u001b[38;5;129;01min\u001b[39;00m result\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 7\u001b[0m row \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m'\u001b[39m: model_name,\n\u001b[1;32m 9\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpca_size\u001b[39m\u001b[38;5;124m'\u001b[39m: pca_size,\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbest_params\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[43mperformance\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbest_params\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m,\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtrain_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtrain_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mval_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mval_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m'\u001b[39m: performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification_report\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;28mstr\u001b[39m(performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification_report\u001b[39m\u001b[38;5;124m'\u001b[39m]) \u001b[38;5;66;03m# Serialize the report as a string\u001b[39;00m\n\u001b[1;32m 15\u001b[0m }\n\u001b[1;32m 16\u001b[0m rows\u001b[38;5;241m.\u001b[39mappend(row)\n\u001b[1;32m 18\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(rows)\n", - "\u001b[0;31mKeyError\u001b[0m: 'best_params'" + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" ] } ], @@ -229,7 +299,7 @@ "csv_filename = f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n", "save_results_to_csv(results=all_results, output_file=csv_filename)\n", "\n", - "print(\"Pipeline complete. Results saved to 'results.json'.\")" + "print(\"Pipeline complete. Results saved to \" + csv_filename)" ] } ], From fca045bfd887b36fff1bbfe4d3fdd72468025d32 Mon Sep 17 00:00:00 2001 From: wyatt522 Date: Mon, 25 Nov 2024 14:27:42 -0500 Subject: [PATCH 3/7] using leave one out cross val and L1 now when applicable --- datasets_to_model.ipynb | 244 +++++++++++++++++----------------------- 1 file changed, 105 insertions(+), 139 deletions(-) diff --git a/datasets_to_model.ipynb b/datasets_to_model.ipynb index c977fc2..55c8fd9 100644 --- a/datasets_to_model.ipynb +++ b/datasets_to_model.ipynb @@ -18,7 +18,7 @@ "from sklearn.dummy import DummyClassifier\n", "from sklearn.metrics import classification_report\n", "from sklearn.preprocessing import StandardScaler\n", - "import json\n", + "from sklearn.model_selection import LeaveOneOut\n", "from datetime import datetime\n" ] }, @@ -68,29 +68,47 @@ "metadata": {}, "outputs": [], "source": [ - "# Train and evaluate models\n", - "def train_and_evaluate(X_train, X_val, X_test, y_train, y_val, y_test, models):\n", + "# Train and evaluate models using LOOCV\n", + "def train_and_evaluate_loocv(X, y, models):\n", + " loo = LeaveOneOut()\n", " results = {}\n", - " \n", + "\n", " for model_name, (model, param_grid) in models.items():\n", - " print(f\"Training {model_name}...\")\n", + " print(f\"Training {model_name} with LOOCV...\")\n", + " fold_scores = []\n", + " all_y_true = []\n", + " all_y_pred = []\n", + "\n", + " # GridSearch for hyperparameter tuning\n", " grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)\n", - " grid_search.fit(X_train, y_train)\n", - " \n", + " grid_search.fit(X, y)\n", " best_model = grid_search.best_estimator_\n", - " train_score = grid_search.best_score_\n", - " val_score = best_model.score(X_val, y_val)\n", - " test_score = best_model.score(X_test, y_test)\n", - " y_pred = best_model.predict(X_test)\n", "\n", - " results[(model_name, X_train.shape[1])] = {\n", + " # Perform LOOCV\n", + " for train_index, test_index in loo.split(X):\n", + " X_train, X_test = X[train_index], X[test_index]\n", + " y_train, y_test = y[train_index], y[test_index]\n", + "\n", + " # Fit and predict with the best model\n", + " best_model.fit(X_train, y_train)\n", + " y_pred = best_model.predict(X_test)\n", + "\n", + " # Track results\n", + " fold_scores.append(best_model.score(X_test, y_test))\n", + " all_y_true.extend(y_test)\n", + " all_y_pred.extend(y_pred)\n", + "\n", + " # Compute overall statistics\n", + " avg_score = np.mean(fold_scores)\n", + " classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)\n", + "\n", + " # Store results\n", + " results[model_name] = {\n", " 'best_params': grid_search.best_params_,\n", - " 'train_score': train_score,\n", - " 'val_score': val_score,\n", - " 'test_score': test_score,\n", - " 'classification_report': classification_report(y_test, y_pred, output_dict=True)\n", + " 'avg_loocv_score': avg_score,\n", + " 'classification_report': classification_report_dict\n", " }\n", - " \n", + "\n", " return results" ] }, @@ -105,13 +123,12 @@ " print(results)\n", " rows = []\n", " for model_name, result in results.items():\n", + " print(result)\n", " row = {\n", " 'model': model_name[0],\n", " 'pca_size': model_name[1],\n", " 'best_params': result['best_params'],\n", - " 'train_score': result['train_score'],\n", - " 'val_score': result['val_score'],\n", - " 'test_score': result['test_score'],\n", + " 'avg loocv score': result['avg_loocv_score'],\n", " 'classification_report': str(result['classification_report']) # Serialize the report as a string\n", " }\n", " rows.append(row)\n", @@ -132,11 +149,10 @@ " 'n_estimators': [50, 100, 200],\n", " 'max_depth': [None, 10, 20]\n", " }),\n", - " 'SVM': (SVC(random_state=42), {\n", - " 'C': [0.1, 1, 10],\n", - " 'kernel': ['linear', 'rbf']\n", + " 'SVM': (SVC(kernel='linear', random_state=42), {\n", + " 'C': [0.1, 1, 10]\n", " }),\n", - " 'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500), {\n", + " 'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500, penalty='l1', solver='liblinear'), {\n", " 'C': [0.1, 1, 10]\n", " }),\n", " 'Naive Bayes': (GaussianNB(), {}),\n", @@ -148,154 +164,104 @@ "\n", "}\n", "\n", - "pca_to_test = [0, 10, 100]\n", + "pca_to_test = [0, 10, 44]\n", "all_results = {}" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "CHANGE WHAT DATA IS BEING LOADED HERE" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'Dataset/pancreastic.csv'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Load Data\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[43mload_and_combine_datasets\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDataset/normal.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDataset/pancreastic.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m cancer_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpancreatic\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "Cell \u001b[0;32mIn[2], line 4\u001b[0m, in \u001b[0;36mload_and_combine_datasets\u001b[0;34m(control_path, cancer_path)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_and_combine_datasets\u001b[39m(control_path, cancer_path):\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# Load datasets\u001b[39;00m\n\u001b[1;32m 3\u001b[0m control_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(control_path)\n\u001b[0;32m----> 4\u001b[0m cancer_df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcancer_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Determine the size for undersampling\u001b[39;00m\n\u001b[1;32m 7\u001b[0m sample_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(cancer_df)\n", - "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 899\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 900\u001b[0m dialect,\n\u001b[1;32m 901\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 908\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 909\u001b[0m )\n\u001b[1;32m 910\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 574\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 576\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 577\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 579\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 580\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", - "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1406\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1407\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1660\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1661\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1662\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1663\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1664\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1668\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1669\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1670\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1671\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1672\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", - "File \u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/io/common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 855\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 856\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 857\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 858\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 859\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 860\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 861\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 862\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 863\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 864\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 865\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 867\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 868\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'Dataset/pancreastic.csv'" - ] - } - ], + "outputs": [], "source": [ "# Load Data\n", - "X, y = load_and_combine_datasets(\"Dataset/normal.csv\", \"Dataset/pancreatic.csv\")\n", - "cancer_type = \"pancreatic\"" + "X, y = load_and_combine_datasets(\"Dataset/normal.csv\", \"Dataset/lung.csv\")\n", + "cancer_type = \"lung\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "running pca on 0 features\n", - "Training Random Forest...\n", - "Training SVM...\n", - "Training Logistic Regression...\n", - "Training Naive Bayes...\n", - "Training KNN...\n", - "Training ZeroR...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "running pca on 10 features\n", - "Training Random Forest...\n", - "Training SVM...\n", - "Training Logistic Regression...\n", - "Training Naive Bayes...\n", - "Training KNN...\n", - "Training ZeroR...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "running pca on 100 features\n", - "Training Random Forest...\n", - "Training SVM...\n", - "Training Logistic Regression...\n", - "Training Naive Bayes...\n", - "Training KNN...\n", - "Training ZeroR...\n", - "{('Random Forest', 54675): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('SVM', 54675): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 54675): {'best_params': {'C': 0.1}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 54675): {'best_params': {}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 54675): {'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 54675): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 100}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('SVM', 10): {'best_params': {'C': 0.1, 'kernel': 'rbf'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 0.1}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 10): {'best_params': {}, 'train_score': 0.9797979797979798, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 10): {'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 10): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}, ('Random Forest', 100): {'best_params': {'max_depth': None, 'n_estimators': 200}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('SVM', 100): {'best_params': {'C': 0.1, 'kernel': 'linear'}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Logistic Regression', 100): {'best_params': {'C': 0.1}, 'train_score': 0.98989898989899, 'val_score': 1.0, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('Naive Bayes', 100): {'best_params': {}, 'train_score': 0.9696969696969697, 'val_score': 0.9545454545454546, 'test_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22.0}}}, ('KNN', 100): {'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'train_score': 0.9800950683303625, 'val_score': 1.0, 'test_score': 0.9545454545454546, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11.0}, '1': {'precision': 0.9166666666666666, 'recall': 1.0, 'f1-score': 0.9565217391304348, 'support': 11.0}, 'accuracy': 0.9545454545454546, 'macro avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}, 'weighted avg': {'precision': 0.9583333333333333, 'recall': 0.9545454545454546, 'f1-score': 0.9544513457556936, 'support': 22.0}}}, ('ZeroR', 100): {'best_params': {}, 'train_score': 0.4898989898989899, 'val_score': 0.5, 'test_score': 0.5, 'classification_report': {'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 11.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 22.0}}}}\n", - "Pipeline complete. Results saved to Result/results_liver_pca_20241119_160536.csv\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/home/wyatt/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" + "Running LOOCV with PCA on 0 features...\n", + "Training Random Forest with LOOCV...\n", + "Training SVM with LOOCV...\n", + "Training Logistic Regression with LOOCV...\n", + "Training Naive Bayes with LOOCV...\n", + "Training KNN with LOOCV...\n", + "Training ZeroR with LOOCV...\n", + "Running LOOCV with PCA on 10 features...\n", + "Training Random Forest with LOOCV...\n", + "Training SVM with LOOCV...\n", + "Training Logistic Regression with LOOCV...\n", + "Training Naive Bayes with LOOCV...\n", + "Training KNN with LOOCV...\n", + "Training ZeroR with LOOCV...\n", + "Running LOOCV with PCA on 44 features...\n", + "Training Random Forest with LOOCV...\n", + "Training SVM with LOOCV...\n", + "Training Logistic Regression with LOOCV...\n", + "Training Naive Bayes with LOOCV...\n", + "Training KNN with LOOCV...\n", + "Training ZeroR with LOOCV...\n", + "{('Random Forest', 0): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('SVM', 0): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Logistic Regression', 0): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Naive Bayes', 0): {'best_params': {}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('KNN', 0): {'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 0): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('SVM', 10): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Naive Bayes', 10): {'best_params': {}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, '1': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('KNN', 10): {'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 10): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}, ('Random Forest', 44): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('SVM', 44): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Logistic Regression', 44): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Naive Bayes', 44): {'best_params': {}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('KNN', 44): {'best_params': {'n_neighbors': 5, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 44): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", + "{'best_params': {}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", + "{'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n", + "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n", + "{'best_params': {}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, '1': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n", + "{'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n", + "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", + "{'best_params': {}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n", + "{'best_params': {'n_neighbors': 5, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n", + "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n", + "Pipeline complete. Results saved to Result/results_liver_pca_20241125_142350.csv\n" ] } ], "source": [ "\n", - "# Split data\n", - "X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)\n", - "X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)\n", - "\n", - "# Preprocess data\n", + "# Preprocess Data\n", "scaler = StandardScaler()\n", - "X_train_scaled = scaler.fit_transform(X_train)\n", - "X_val_scaled = scaler.transform(X_val)\n", - "X_test_scaled = scaler.transform(X_test)\n", + "X_scaled = scaler.fit_transform(X)\n", + "\n", "\n", + "all_results = {}\n", "\n", "for num_pca in pca_to_test:\n", - " print(f\"running pca on {num_pca} features\")\n", - " # Run PCA feature reduction\n", + " print(f\"Running LOOCV with PCA on {num_pca} features...\")\n", + " # Apply PCA if specified\n", " if num_pca == 0:\n", - " X_train_pca = X_train_scaled\n", - " X_val_pca = X_val_scaled\n", - " X_test_pca = X_test_scaled\n", + " X_pca = X_scaled\n", " else:\n", - " X_train_pca, pca = pca_data(X_train_scaled, num_pca)\n", - " X_val_pca = pca.transform(X_val_scaled)\n", - " X_test_pca = pca.transform(X_test_scaled)\n", + " X_pca, pca = pca_data(X_scaled, num_pca)\n", "\n", - " # Train and evaluate models\n", - " model_results = train_and_evaluate(X_train_pca, X_val_pca, X_test_pca, y_train, y_val, y_test, models_to_test)\n", + " # Train and evaluate models using LOOCV\n", + " model_results = train_and_evaluate_loocv(X_pca, y, models_to_test)\n", + " all_results.update({(model_name, num_pca): result for model_name, result in model_results.items()})\n", "\n", - " # Update results\n", - " all_results.update(model_results)\n", " \n", - "# save total resutls\n", + "# save total results\n", "csv_filename = f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n", "save_results_to_csv(results=all_results, output_file=csv_filename)\n", "\n", From 1c5ad1c8d35321a05d3818271c48bb85e2a1bc6a Mon Sep 17 00:00:00 2001 From: wyatt522 Date: Sat, 30 Nov 2024 23:43:44 -0500 Subject: [PATCH 4/7] reorganized and created aggregate model and inference file --- .gitignore | 9 +- aggregate_and_infer.ipynb | 201 +++++++++++++++++++++ datasets_to_model.ipynb | 293 ------------------------------ train_and_evaluate.ipynb | 363 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 572 insertions(+), 294 deletions(-) create mode 100644 aggregate_and_infer.ipynb delete mode 100644 datasets_to_model.ipynb create mode 100644 train_and_evaluate.ipynb diff --git a/.gitignore b/.gitignore index 6be7e48..e5dd687 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,10 @@ Dataset/* !Dataset/.gitkeep -.venv/* \ No newline at end of file +.venv/* +ControlDataset/* +Models/* +NotInUseModels/* +Result/* +Scalers/* +TestDataset/* +inference_results.csv \ No newline at end of file diff --git a/aggregate_and_infer.ipynb b/aggregate_and_infer.ipynb new file mode 100644 index 0000000..dbaf357 --- /dev/null +++ b/aggregate_and_infer.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import joblib\n", + "import pandas as pd\n", + "from sklearn.preprocessing import StandardScaler\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_models(models_folder=\"Models\"):\n", + " \"\"\"Load all saved models from the specified folder.\"\"\"\n", + " models = {}\n", + " for file_name in os.listdir(models_folder):\n", + " if file_name.endswith(\".joblib\"):\n", + " # Extract model name and cancer type from the filename\n", + " model_name, cancer_type = file_name.split(\"_\")\n", + " model_path = os.path.join(models_folder, file_name)\n", + " models[(model_name, cancer_type)] = joblib.load(model_path)\n", + " print(f\"Loaded model: {model_name} for cancer type: {cancer_type}\")\n", + " return models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess_data(new_data_path, scaler=None):\n", + " \"\"\"Load and preprocess new data.\"\"\"\n", + " # Load the data\n", + " data = pd.read_csv(new_data_path)\n", + " \n", + " # Drop unnecessary columns (adjust this based on your dataset structure)\n", + " X = data.drop(['cancer_type', 'type'], axis=1, errors='ignore')\n", + " \n", + " # Standardize the data\n", + " if scaler is None:\n", + " scaler = StandardScaler()\n", + " X_scaled = scaler.fit_transform(X)\n", + " else:\n", + " X_scaled = scaler.transform(X)\n", + " \n", + " return X_scaled, data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_inference(models, test_data_path, scalers_folder=\"Scalers\"):\n", + " # Load the test data\n", + " test_df = pd.read_csv(test_data_path)\n", + " test_features = test_df.drop(['cancer_type', 'type'], axis=1) # Drop label columns if they exist\n", + "\n", + " # List all scaler files in the Scalers folder\n", + " scaler_files = [f for f in os.listdir(scalers_folder) if f.endswith('.joblib')]\n", + " \n", + " results = []\n", + " \n", + " for index, row in test_df.iterrows():\n", + " row_predictions = {} # Store model predictions and their confidence\n", + " \n", + " for (model_name, cancer_type), model in models.items():\n", + " # Find the scaler corresponding to the cancer type\n", + " cancer_type, _ = cancer_type.split(\".\")\n", + " scaler_filename = f\"{cancer_type}_scaler.joblib\"\n", + " if scaler_filename in scaler_files:\n", + " scaler_path = os.path.join(scalers_folder, scaler_filename)\n", + " scaler = joblib.load(scaler_path)\n", + " test_features_scaled = scaler.transform([test_features.iloc[index]]) # Transform a single row\n", + " \n", + " # Run inference with the model\n", + " probabilities = model.predict_proba(test_features_scaled)\n", + " confidence = probabilities[0][1]\n", + " \n", + " # Store the model's prediction and confidence\n", + " row_predictions[f\"{model_name}-{cancer_type}\"] = {\n", + " 'cancer_type': cancer_type,\n", + " 'predicted_class': model.predict(test_features_scaled)[0],\n", + " 'confidence': confidence,\n", + " 'probabilities': probabilities[0].tolist()\n", + " }\n", + " else:\n", + " print(f\"couldn't find {cancer_type}_scaler.joblib\")\n", + "\n", + " # Determine the final prediction based on the highest confidence\n", + " final_prediction = \"normal\"\n", + " max_confidence = -1\n", + " \n", + " for model_name, prediction_info in row_predictions.items():\n", + " if prediction_info['confidence'] > max_confidence and prediction_info['confidence'] > 0.5:\n", + " max_confidence = prediction_info['confidence']\n", + " final_prediction = prediction_info['cancer_type']\n", + "\n", + " # Append the final prediction for the current data point\n", + " results.append({\n", + " 'index': index,\n", + " 'cancer_type': final_prediction,\n", + " 'confidence': max_confidence if final_prediction is not \"normal\" else -1.0,\n", + " 'predictions': row_predictions\n", + " })\n", + "\n", + " return results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def save_inference_results(results, output_file=\"inference_results.csv\"):\n", + " # Convert the results to a DataFrame\n", + " df_results = pd.DataFrame(results)\n", + " df_results.to_csv(output_file, index=False)\n", + " print(f\"Inference results saved to {output_file}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the updated function\n", + "models_folder = \"Models\"\n", + "new_data_path = \"TestDataset/test_data.csv\"\n", + "\n", + "# Load models\n", + "models = load_models(models_folder=models_folder)\n", + "\n", + "\n", + "# Run inference\n", + "inference_results = run_inference(models, new_data_path)\n", + "\n", + "# Save results\n", + "save_inference_results(inference_results, output_file=\"inference_results.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 88.24%\n" + ] + } + ], + "source": [ + "test_df = pd.read_csv(\"TestDataset/test_data.csv\")\n", + "\n", + "predictions_df = pd.read_csv(\"inference_results.csv\")\n", + "\n", + "\n", + "accuracy = (predictions_df[\"cancer_type\"] == test_df[\"cancer_type\"]).mean()\n", + "print(f\"Accuracy: {accuracy:.2%}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/datasets_to_model.ipynb b/datasets_to_model.ipynb deleted file mode 100644 index 55c8fd9..0000000 --- a/datasets_to_model.ipynb +++ /dev/null @@ -1,293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from sklearn.model_selection import train_test_split, GridSearchCV\n", - "from sklearn.decomposition import PCA\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.svm import SVC\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.naive_bayes import GaussianNB\n", - "from sklearn.neighbors import KNeighborsClassifier\n", - "from sklearn.dummy import DummyClassifier\n", - "from sklearn.metrics import classification_report\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.model_selection import LeaveOneOut\n", - "from datetime import datetime\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def load_and_combine_datasets(control_path, cancer_path):\n", - " # Load datasets\n", - " control_df = pd.read_csv(control_path)\n", - " cancer_df = pd.read_csv(cancer_path)\n", - " \n", - " # Determine the size for undersampling\n", - " sample_size = len(cancer_df)\n", - " \n", - " # Undersample the healthy dataset\n", - " control_df_sample = control_df.sample(n=sample_size, random_state=42)\n", - " \n", - " # Combine datasets and shuffle\n", - " combined_df = pd.concat([control_df_sample, cancer_df]).sample(frac=1, random_state=42).reset_index(drop=True)\n", - " \n", - " # Split into features and labels\n", - " X = combined_df.drop(['cancer_type', 'type'], axis=1)\n", - " y = combined_df['cancer_type'].apply(lambda x: 1 if x != 'normal' else 0) # 1 = cancer, 0 = healthy\n", - " \n", - " return X, y\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def pca_data(X, n_components):\n", - " # Apply PCA (assumes X is already standardized)\n", - " pca = PCA(n_components=n_components)\n", - " X_pca = pca.fit_transform(X)\n", - " return X_pca, pca" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Train and evaluate models using LOOCV\n", - "def train_and_evaluate_loocv(X, y, models):\n", - " loo = LeaveOneOut()\n", - " results = {}\n", - "\n", - " for model_name, (model, param_grid) in models.items():\n", - " print(f\"Training {model_name} with LOOCV...\")\n", - " fold_scores = []\n", - " all_y_true = []\n", - " all_y_pred = []\n", - "\n", - " # GridSearch for hyperparameter tuning\n", - " grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)\n", - " grid_search.fit(X, y)\n", - " best_model = grid_search.best_estimator_\n", - "\n", - " # Perform LOOCV\n", - " for train_index, test_index in loo.split(X):\n", - " X_train, X_test = X[train_index], X[test_index]\n", - " y_train, y_test = y[train_index], y[test_index]\n", - "\n", - " # Fit and predict with the best model\n", - " best_model.fit(X_train, y_train)\n", - " y_pred = best_model.predict(X_test)\n", - "\n", - " # Track results\n", - " fold_scores.append(best_model.score(X_test, y_test))\n", - " all_y_true.extend(y_test)\n", - " all_y_pred.extend(y_pred)\n", - "\n", - " # Compute overall statistics\n", - " avg_score = np.mean(fold_scores)\n", - " classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)\n", - "\n", - " # Store results\n", - " results[model_name] = {\n", - " 'best_params': grid_search.best_params_,\n", - " 'avg_loocv_score': avg_score,\n", - " 'classification_report': classification_report_dict\n", - " }\n", - "\n", - " return results" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def save_results_to_csv(results, output_file):\n", - " # Convert the results dictionary into a DataFrame\n", - " print(results)\n", - " rows = []\n", - " for model_name, result in results.items():\n", - " print(result)\n", - " row = {\n", - " 'model': model_name[0],\n", - " 'pca_size': model_name[1],\n", - " 'best_params': result['best_params'],\n", - " 'avg loocv score': result['avg_loocv_score'],\n", - " 'classification_report': str(result['classification_report']) # Serialize the report as a string\n", - " }\n", - " rows.append(row)\n", - " \n", - " df = pd.DataFrame(rows)\n", - " df.to_csv(output_file, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Define models and parameter grids\n", - "models_to_test = {\n", - " 'Random Forest': (RandomForestClassifier(random_state=42), {\n", - " 'n_estimators': [50, 100, 200],\n", - " 'max_depth': [None, 10, 20]\n", - " }),\n", - " 'SVM': (SVC(kernel='linear', random_state=42), {\n", - " 'C': [0.1, 1, 10]\n", - " }),\n", - " 'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500, penalty='l1', solver='liblinear'), {\n", - " 'C': [0.1, 1, 10]\n", - " }),\n", - " 'Naive Bayes': (GaussianNB(), {}),\n", - " 'KNN': (KNeighborsClassifier(), {\n", - " 'n_neighbors': [3, 5, 7, 10],\n", - " 'weights': ['uniform', 'distance']\n", - " }),\n", - " 'ZeroR': (DummyClassifier(strategy='most_frequent'), {}), # ZeroR always predicts the most frequent class\n", - "\n", - "}\n", - "\n", - "pca_to_test = [0, 10, 44]\n", - "all_results = {}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "CHANGE WHAT DATA IS BEING LOADED HERE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load Data\n", - "X, y = load_and_combine_datasets(\"Dataset/normal.csv\", \"Dataset/lung.csv\")\n", - "cancer_type = \"lung\"" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running LOOCV with PCA on 0 features...\n", - "Training Random Forest with LOOCV...\n", - "Training SVM with LOOCV...\n", - "Training Logistic Regression with LOOCV...\n", - "Training Naive Bayes with LOOCV...\n", - "Training KNN with LOOCV...\n", - "Training ZeroR with LOOCV...\n", - "Running LOOCV with PCA on 10 features...\n", - "Training Random Forest with LOOCV...\n", - "Training SVM with LOOCV...\n", - "Training Logistic Regression with LOOCV...\n", - "Training Naive Bayes with LOOCV...\n", - "Training KNN with LOOCV...\n", - "Training ZeroR with LOOCV...\n", - "Running LOOCV with PCA on 44 features...\n", - "Training Random Forest with LOOCV...\n", - "Training SVM with LOOCV...\n", - "Training Logistic Regression with LOOCV...\n", - "Training Naive Bayes with LOOCV...\n", - "Training KNN with LOOCV...\n", - "Training ZeroR with LOOCV...\n", - "{('Random Forest', 0): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('SVM', 0): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Logistic Regression', 0): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Naive Bayes', 0): {'best_params': {}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('KNN', 0): {'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 0): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}, ('Random Forest', 10): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('SVM', 10): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Logistic Regression', 10): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Naive Bayes', 10): {'best_params': {}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, '1': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('KNN', 10): {'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 10): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}, ('Random Forest', 44): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('SVM', 44): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}, ('Logistic Regression', 44): {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, ('Naive Bayes', 44): {'best_params': {}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}, ('KNN', 44): {'best_params': {'n_neighbors': 5, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}, ('ZeroR', 44): {'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}}\n", - "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", - "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", - "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", - "{'best_params': {}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", - "{'best_params': {'n_neighbors': 3, 'weights': 'uniform'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n", - "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n", - "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n", - "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n", - "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n", - "{'best_params': {}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, '1': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n", - "{'best_params': {'n_neighbors': 3, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n", - "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n", - "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n", - "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9791666666666666, 'classification_report': {'0': {'precision': 0.9859154929577465, 'recall': 0.9722222222222222, 'f1-score': 0.979020979020979, 'support': 72.0}, '1': {'precision': 0.9726027397260274, 'recall': 0.9861111111111112, 'f1-score': 0.9793103448275863, 'support': 72.0}, 'accuracy': 0.9791666666666666, 'macro avg': {'precision': 0.9792591163418869, 'recall': 0.9791666666666667, 'f1-score': 0.9791656619242826, 'support': 144.0}, 'weighted avg': {'precision': 0.979259116341887, 'recall': 0.9791666666666666, 'f1-score': 0.9791656619242826, 'support': 144.0}}}\n", - "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", - "{'best_params': {}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, '1': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}, 'weighted avg': {'precision': 0.9861111111111112, 'recall': 0.9861111111111112, 'f1-score': 0.9861111111111112, 'support': 144.0}}}\n", - "{'best_params': {'n_neighbors': 5, 'weights': 'distance'}, 'avg_loocv_score': 0.9861111111111112, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9722222222222222, 'f1-score': 0.9859154929577464, 'support': 72.0}, '1': {'precision': 0.972972972972973, 'recall': 1.0, 'f1-score': 0.9863013698630138, 'support': 72.0}, 'accuracy': 0.9861111111111112, 'macro avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.9861084314103801, 'support': 144.0}, 'weighted avg': {'precision': 0.9864864864864865, 'recall': 0.9861111111111112, 'f1-score': 0.98610843141038, 'support': 144.0}}}\n", - "{'best_params': {}, 'avg_loocv_score': 0.0, 'classification_report': {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 144.0}}}\n", - "Pipeline complete. Results saved to Result/results_liver_pca_20241125_142350.csv\n" - ] - } - ], - "source": [ - "\n", - "# Preprocess Data\n", - "scaler = StandardScaler()\n", - "X_scaled = scaler.fit_transform(X)\n", - "\n", - "\n", - "all_results = {}\n", - "\n", - "for num_pca in pca_to_test:\n", - " print(f\"Running LOOCV with PCA on {num_pca} features...\")\n", - " # Apply PCA if specified\n", - " if num_pca == 0:\n", - " X_pca = X_scaled\n", - " else:\n", - " X_pca, pca = pca_data(X_scaled, num_pca)\n", - "\n", - " # Train and evaluate models using LOOCV\n", - " model_results = train_and_evaluate_loocv(X_pca, y, models_to_test)\n", - " all_results.update({(model_name, num_pca): result for model_name, result in model_results.items()})\n", - "\n", - " \n", - "# save total results\n", - "csv_filename = f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n", - "save_results_to_csv(results=all_results, output_file=csv_filename)\n", - "\n", - "print(\"Pipeline complete. Results saved to \" + csv_filename)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/train_and_evaluate.ipynb b/train_and_evaluate.ipynb new file mode 100644 index 0000000..2582c97 --- /dev/null +++ b/train_and_evaluate.ipynb @@ -0,0 +1,363 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.dummy import DummyClassifier\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import LeaveOneOut\n", + "from datetime import datetime\n", + "import os\n", + "import joblib\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def load_and_combine_datasets(control_path, cancer_path):\n", + " # Load datasets\n", + " control_df = pd.read_csv(control_path)\n", + " cancer_df = pd.read_csv(cancer_path)\n", + " \n", + " # Determine the size for undersampling\n", + " sample_size = len(cancer_df)\n", + " \n", + " # Undersample the healthy dataset\n", + " control_df_sample = control_df.sample(n=sample_size)\n", + " \n", + " # Combine datasets and shuffle\n", + " combined_df = pd.concat([control_df_sample, cancer_df]).sample(frac=1, random_state=42).reset_index(drop=True)\n", + " \n", + " # Split into features and labels\n", + " X = combined_df.drop(['cancer_type', 'type'], axis=1)\n", + " y = combined_df['cancer_type'].apply(lambda x: 1 if x != 'normal' else 0) # 1 = cancer, 0 = healthy\n", + " \n", + " return X, y\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def pca_data(X, n_components):\n", + " # Apply PCA (assumes X is already standardized)\n", + " pca = PCA(n_components=n_components)\n", + " X_pca = pca.fit_transform(X)\n", + " return X_pca, pca" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def save_model(model, model_name, cancer_type, folder=\"Models\"):\n", + " \"\"\"Save the model to a file using joblib.\"\"\"\n", + " if not os.path.exists(folder):\n", + " os.makedirs(folder) # Create directory if it doesn't exist\n", + " model_filename = f\"{folder}/{model_name}_{cancer_type}.joblib\"\n", + " joblib.dump(model, model_filename)\n", + " print(f\"Model {model_name} saved to {model_filename}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Train and evaluate models using LOOCV\n", + "def train_and_evaluate_loocv(X, y, models, cancer_type):\n", + " loo = LeaveOneOut()\n", + " results = {}\n", + "\n", + " for model_name, (model, param_grid) in models.items():\n", + " print(f\"Training {model_name} with LOOCV...\")\n", + " fold_scores = []\n", + " all_y_true = []\n", + " all_y_pred = []\n", + "\n", + " # GridSearch for hyperparameter tuning\n", + " grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)\n", + " grid_search.fit(X, y)\n", + " best_model = grid_search.best_estimator_\n", + " # Save the best model\n", + " save_model(best_model, model_name, cancer_type)\n", + "\n", + " # Perform LOOCV\n", + " for train_index, test_index in loo.split(X):\n", + " X_train, X_test = X[train_index], X[test_index]\n", + " y_train, y_test = y[train_index], y[test_index]\n", + "\n", + " # Fit and predict with the best model\n", + " best_model.fit(X_train, y_train)\n", + " y_pred = best_model.predict(X_test)\n", + "\n", + " # Track results\n", + " fold_scores.append(best_model.score(X_test, y_test))\n", + " all_y_true.extend(y_test)\n", + " all_y_pred.extend(y_pred)\n", + "\n", + " # Compute overall statistics\n", + " avg_score = np.mean(fold_scores)\n", + " classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)\n", + "\n", + " # Store results\n", + " results[model_name] = {\n", + " 'best_params': grid_search.best_params_,\n", + " 'avg_loocv_score': avg_score,\n", + " 'classification_report': classification_report_dict\n", + " }\n", + "\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def save_results_to_csv(results, output_file):\n", + " # Convert the results dictionary into a DataFrame\n", + " print(results)\n", + " rows = []\n", + " for model_name, result in results.items():\n", + " print(result)\n", + " row = {\n", + " 'model': model_name,\n", + " 'best_params': result['best_params'],\n", + " 'avg loocv score': result['avg_loocv_score'],\n", + " 'classification_report': str(result['classification_report']) # Serialize the report as a string\n", + " }\n", + " rows.append(row)\n", + " \n", + " df = pd.DataFrame(rows)\n", + " df.to_csv(output_file, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Define models and parameter grids\n", + "models_to_test = {\n", + " 'Random Forest': (RandomForestClassifier(random_state=42), {\n", + " 'n_estimators': [50, 100],\n", + " 'max_depth': [None, 10]\n", + " }),\n", + " # 'SVM': (SVC(kernel='linear', random_state=42), {\n", + " # 'C': [0.1, 1, 10]\n", + " # }),\n", + " 'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500, penalty='l1', solver='liblinear'), {\n", + " 'C': [0.1, 1]\n", + " }),\n", + " # 'Naive Bayes': (GaussianNB(), {}),\n", + " # 'KNN': (KNeighborsClassifier(), {\n", + " # 'n_neighbors': [3, 5, 7, 10],\n", + " # 'weights': ['uniform', 'distance']\n", + " # }),\n", + " # 'ZeroR': (DummyClassifier(strategy='most_frequent'), {}), # ZeroR always predicts the most frequent class\n", + "\n", + "}\n", + "\n", + "all_results = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing datasets: Dataset/throat.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_throat.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_throat.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}\n", + "Pipeline complete. Results saved to Result/results_throat_pca_20241130_210102.csv\n", + "Processing datasets: Dataset/prostate.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_prostate.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_prostate.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9347826086956522, 'classification_report': {'0': {'precision': 0.9761904761904762, 'recall': 0.8913043478260869, 'f1-score': 0.9318181818181818, 'support': 46.0}, '1': {'precision': 0.9, 'recall': 0.9782608695652174, 'f1-score': 0.9375, 'support': 46.0}, 'accuracy': 0.9347826086956522, 'macro avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956521, 'f1-score': 0.9346590909090908, 'support': 92.0}, 'weighted avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956522, 'f1-score': 0.9346590909090908, 'support': 92.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.9565217391304348, 'classification_report': {'0': {'precision': 0.9772727272727273, 'recall': 0.9347826086956522, 'f1-score': 0.9555555555555557, 'support': 46.0}, '1': {'precision': 0.9375, 'recall': 0.9782608695652174, 'f1-score': 0.9574468085106383, 'support': 46.0}, 'accuracy': 0.9565217391304348, 'macro avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.9565011820330971, 'support': 92.0}, 'weighted avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.956501182033097, 'support': 92.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9347826086956522, 'classification_report': {'0': {'precision': 0.9761904761904762, 'recall': 0.8913043478260869, 'f1-score': 0.9318181818181818, 'support': 46.0}, '1': {'precision': 0.9, 'recall': 0.9782608695652174, 'f1-score': 0.9375, 'support': 46.0}, 'accuracy': 0.9347826086956522, 'macro avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956521, 'f1-score': 0.9346590909090908, 'support': 92.0}, 'weighted avg': {'precision': 0.9380952380952381, 'recall': 0.9347826086956522, 'f1-score': 0.9346590909090908, 'support': 92.0}}}\n", + "{'best_params': {'C': 1}, 'avg_loocv_score': 0.9565217391304348, 'classification_report': {'0': {'precision': 0.9772727272727273, 'recall': 0.9347826086956522, 'f1-score': 0.9555555555555557, 'support': 46.0}, '1': {'precision': 0.9375, 'recall': 0.9782608695652174, 'f1-score': 0.9574468085106383, 'support': 46.0}, 'accuracy': 0.9565217391304348, 'macro avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.9565011820330971, 'support': 92.0}, 'weighted avg': {'precision': 0.9573863636363636, 'recall': 0.9565217391304348, 'f1-score': 0.956501182033097, 'support': 92.0}}}\n", + "Pipeline complete. Results saved to Result/results_prostate_pca_20241130_210230.csv\n", + "Processing datasets: Dataset/leukemia.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_leukemia.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_leukemia.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 66.0}}}\n", + "Pipeline complete. Results saved to Result/results_leukemia_pca_20241130_210320.csv\n", + "Processing datasets: Dataset/lung.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_lung.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_lung.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9826086956521739, 'classification_report': {'0': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, '1': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, 'accuracy': 0.9826086956521739, 'macro avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}, 'weighted avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9782608695652174, 'classification_report': {'0': {'precision': 0.9824561403508771, 'recall': 0.9739130434782609, 'f1-score': 0.9781659388646288, 'support': 115.0}, '1': {'precision': 0.9741379310344828, 'recall': 0.9826086956521739, 'f1-score': 0.9783549783549784, 'support': 115.0}, 'accuracy': 0.9782608695652174, 'macro avg': {'precision': 0.9782970356926799, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}, 'weighted avg': {'precision': 0.97829703569268, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9826086956521739, 'classification_report': {'0': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, '1': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 115.0}, 'accuracy': 0.9826086956521739, 'macro avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}, 'weighted avg': {'precision': 0.9826086956521739, 'recall': 0.9826086956521739, 'f1-score': 0.9826086956521739, 'support': 230.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9782608695652174, 'classification_report': {'0': {'precision': 0.9824561403508771, 'recall': 0.9739130434782609, 'f1-score': 0.9781659388646288, 'support': 115.0}, '1': {'precision': 0.9741379310344828, 'recall': 0.9826086956521739, 'f1-score': 0.9783549783549784, 'support': 115.0}, 'accuracy': 0.9782608695652174, 'macro avg': {'precision': 0.9782970356926799, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}, 'weighted avg': {'precision': 0.97829703569268, 'recall': 0.9782608695652174, 'f1-score': 0.9782604586098036, 'support': 230.0}}}\n", + "Pipeline complete. Results saved to Result/results_lung_pca_20241130_211200.csv\n", + "Processing datasets: Dataset/colorectal.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_colorectal.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_colorectal.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.990625, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.98125, 'f1-score': 0.9905362776025236, 'support': 160.0}, '1': {'precision': 0.9815950920245399, 'recall': 1.0, 'f1-score': 0.9907120743034055, 'support': 160.0}, 'accuracy': 0.990625, 'macro avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}, 'weighted avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.996875, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.99375, 'f1-score': 0.9968652037617556, 'support': 160.0}, '1': {'precision': 0.9937888198757764, 'recall': 1.0, 'f1-score': 0.9968847352024921, 'support': 160.0}, 'accuracy': 0.996875, 'macro avg': {'precision': 0.9968944099378882, 'recall': 0.996875, 'f1-score': 0.9968749694821238, 'support': 320.0}, 'weighted avg': {'precision': 0.996894409937888, 'recall': 0.996875, 'f1-score': 0.9968749694821237, 'support': 320.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.990625, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.98125, 'f1-score': 0.9905362776025236, 'support': 160.0}, '1': {'precision': 0.9815950920245399, 'recall': 1.0, 'f1-score': 0.9907120743034055, 'support': 160.0}, 'accuracy': 0.990625, 'macro avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}, 'weighted avg': {'precision': 0.99079754601227, 'recall': 0.990625, 'f1-score': 0.9906241759529646, 'support': 320.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.996875, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.99375, 'f1-score': 0.9968652037617556, 'support': 160.0}, '1': {'precision': 0.9937888198757764, 'recall': 1.0, 'f1-score': 0.9968847352024921, 'support': 160.0}, 'accuracy': 0.996875, 'macro avg': {'precision': 0.9968944099378882, 'recall': 0.996875, 'f1-score': 0.9968749694821238, 'support': 320.0}, 'weighted avg': {'precision': 0.996894409937888, 'recall': 0.996875, 'f1-score': 0.9968749694821237, 'support': 320.0}}}\n", + "Pipeline complete. Results saved to Result/results_colorectal_pca_20241130_212335.csv\n", + "Processing datasets: Dataset/liver.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_liver.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_liver.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.9863013698630136, 'recall': 1.0, 'f1-score': 0.993103448275862, 'support': 72.0}, 'accuracy': 0.9930555555555556, 'macro avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414276, 'support': 144.0}, 'weighted avg': {'precision': 0.9931506849315068, 'recall': 0.9930555555555556, 'f1-score': 0.9930552206414275, 'support': 144.0}}}\n", + "Pipeline complete. Results saved to Result/results_liver_pca_20241130_212555.csv\n", + "Processing datasets: Dataset/renal.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_renal.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_renal.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9620253164556962, 'classification_report': {'0': {'precision': 0.974025974025974, 'recall': 0.9493670886075949, 'f1-score': 0.9615384615384615, 'support': 79.0}, '1': {'precision': 0.9506172839506173, 'recall': 0.9746835443037974, 'f1-score': 0.9625, 'support': 79.0}, 'accuracy': 0.9620253164556962, 'macro avg': {'precision': 0.9623216289882957, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692308, 'support': 158.0}, 'weighted avg': {'precision': 0.9623216289882954, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692309, 'support': 158.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.9367088607594937, 'classification_report': {'0': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, '1': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, 'accuracy': 0.9367088607594937, 'macro avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}, 'weighted avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.9620253164556962, 'classification_report': {'0': {'precision': 0.974025974025974, 'recall': 0.9493670886075949, 'f1-score': 0.9615384615384615, 'support': 79.0}, '1': {'precision': 0.9506172839506173, 'recall': 0.9746835443037974, 'f1-score': 0.9625, 'support': 79.0}, 'accuracy': 0.9620253164556962, 'macro avg': {'precision': 0.9623216289882957, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692308, 'support': 158.0}, 'weighted avg': {'precision': 0.9623216289882954, 'recall': 0.9620253164556962, 'f1-score': 0.9620192307692309, 'support': 158.0}}}\n", + "{'best_params': {'C': 1}, 'avg_loocv_score': 0.9367088607594937, 'classification_report': {'0': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, '1': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 79.0}, 'accuracy': 0.9367088607594937, 'macro avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}, 'weighted avg': {'precision': 0.9367088607594937, 'recall': 0.9367088607594937, 'f1-score': 0.9367088607594937, 'support': 158.0}}}\n", + "Pipeline complete. Results saved to Result/results_renal_pca_20241130_213009.csv\n", + "Processing datasets: Dataset/bladder.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_bladder.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_bladder.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 43.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 86.0}}}\n", + "Pipeline complete. Results saved to Result/results_bladder_pca_20241130_213111.csv\n", + "Processing datasets: Dataset/gastric.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_gastric.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_gastric.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.95, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9, 'f1-score': 0.9473684210526316, 'support': 20.0}, '1': {'precision': 0.9090909090909091, 'recall': 1.0, 'f1-score': 0.9523809523809523, 'support': 20.0}, 'accuracy': 0.95, 'macro avg': {'precision': 0.9545454545454546, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}, 'weighted avg': {'precision': 0.9545454545454545, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.925, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.85, 'f1-score': 0.9189189189189189, 'support': 20.0}, '1': {'precision': 0.8695652173913043, 'recall': 1.0, 'f1-score': 0.9302325581395349, 'support': 20.0}, 'accuracy': 0.925, 'macro avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}, 'weighted avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.95, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9, 'f1-score': 0.9473684210526316, 'support': 20.0}, '1': {'precision': 0.9090909090909091, 'recall': 1.0, 'f1-score': 0.9523809523809523, 'support': 20.0}, 'accuracy': 0.95, 'macro avg': {'precision': 0.9545454545454546, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}, 'weighted avg': {'precision': 0.9545454545454545, 'recall': 0.95, 'f1-score': 0.949874686716792, 'support': 40.0}}}\n", + "{'best_params': {'C': 1}, 'avg_loocv_score': 0.925, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.85, 'f1-score': 0.9189189189189189, 'support': 20.0}, '1': {'precision': 0.8695652173913043, 'recall': 1.0, 'f1-score': 0.9302325581395349, 'support': 20.0}, 'accuracy': 0.925, 'macro avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}, 'weighted avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40.0}}}\n", + "Pipeline complete. Results saved to Result/results_gastric_pca_20241130_213147.csv\n", + "Processing datasets: Dataset/brain.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_brain.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_brain.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}, 'Logistic Regression': {'best_params': {'C': 1}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}\n", + "{'best_params': {'C': 1}, 'avg_loocv_score': 0.9892086330935251, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9784172661870504, 'f1-score': 0.989090909090909, 'support': 139.0}, '1': {'precision': 0.9788732394366197, 'recall': 1.0, 'f1-score': 0.9893238434163701, 'support': 139.0}, 'accuracy': 0.9892086330935251, 'macro avg': {'precision': 0.9894366197183099, 'recall': 0.9892086330935252, 'f1-score': 0.9892073762536395, 'support': 278.0}, 'weighted avg': {'precision': 0.9894366197183098, 'recall': 0.9892086330935251, 'f1-score': 0.9892073762536397, 'support': 278.0}}}\n", + "Pipeline complete. Results saved to Result/results_brain_pca_20241130_213953.csv\n", + "Processing datasets: Dataset/breast.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_breast.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_breast.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9980694980694981, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9961389961389961, 'f1-score': 0.9980657640232108, 'support': 259.0}, '1': {'precision': 0.9961538461538462, 'recall': 1.0, 'f1-score': 0.9980732177263969, 'support': 259.0}, 'accuracy': 0.9980694980694981, 'macro avg': {'precision': 0.9980769230769231, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748038, 'support': 518.0}, 'weighted avg': {'precision': 0.998076923076923, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748037, 'support': 518.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9980694980694981, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9961389961389961, 'f1-score': 0.9980657640232108, 'support': 259.0}, '1': {'precision': 0.9961538461538462, 'recall': 1.0, 'f1-score': 0.9980732177263969, 'support': 259.0}, 'accuracy': 0.9980694980694981, 'macro avg': {'precision': 0.9980769230769231, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748038, 'support': 518.0}, 'weighted avg': {'precision': 0.998076923076923, 'recall': 0.9980694980694981, 'f1-score': 0.9980694908748037, 'support': 518.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 1.0, 'classification_report': {'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 259.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 518.0}}}\n", + "Pipeline complete. Results saved to Result/results_breast_pca_20241130_221107.csv\n", + "Processing datasets: Dataset/pancreatic.csv\n", + "Training Random Forest with LOOCV...\n", + "Model Random Forest saved to Models/Random Forest_pancreatic.joblib\n", + "Training Logistic Regression with LOOCV...\n", + "Model Logistic Regression saved to Models/Logistic Regression_pancreatic.joblib\n", + "{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}}\n", + "{'best_params': {'max_depth': None, 'n_estimators': 100}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}\n", + "{'best_params': {'C': 0.1}, 'avg_loocv_score': 0.96875, 'classification_report': {'0': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, '1': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 32.0}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64.0}}}\n", + "Pipeline complete. Results saved to Result/results_pancreatic_pca_20241130_221213.csv\n" + ] + } + ], + "source": [ + "\n", + "dataset_files = [f for f in os.listdir(\"Dataset\") if f.endswith('.csv')]\n", + "for dataset_file in dataset_files:\n", + " file_path = os.path.join(\"Dataset\", dataset_file)\n", + "\n", + " cancer_type = os.path.splitext(file_path)[0].split('/')[-1]\n", + " print(f\"Processing datasets: {file_path}\")\n", + "\n", + " X, y = load_and_combine_datasets(\"ControlDataset/normal.csv\", file_path)\n", + "\n", + " scaler = StandardScaler()\n", + " X_scaled = scaler.fit_transform(X)\n", + " joblib.dump(scaler, f\"Scalers/{cancer_type}_scaler.joblib\")\n", + "\n", + "\n", + " all_results = {}\n", + "\n", + "\n", + " # Train and evaluate models using LOOCV\n", + " model_results = train_and_evaluate_loocv(X_scaled, y, models_to_test, cancer_type)\n", + " all_results.update({model_name: result for model_name, result in model_results.items()})\n", + "\n", + " \n", + " # save total results\n", + " csv_filename = f\"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv\"\n", + " save_results_to_csv(results=all_results, output_file=csv_filename)\n", + "\n", + " print(\"Pipeline complete. Results saved to \" + csv_filename)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 99c09eeb25a7c0cb31fdce96a12021728e279a87 Mon Sep 17 00:00:00 2001 From: wyatt522 Date: Sun, 1 Dec 2024 11:36:54 -0500 Subject: [PATCH 5/7] added way to visualize most impactful logistic weights --- .gitignore | 4 +- visualize_logistic_model.ipynb | 196 +++++++++++++++++++++++++++++++++ 2 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 visualize_logistic_model.ipynb diff --git a/.gitignore b/.gitignore index e5dd687..b0f7530 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ NotInUseModels/* Result/* Scalers/* TestDataset/* -inference_results.csv \ No newline at end of file +inference_results.csv +FeatureWeights/* +CombinedFeatureWeights.csv \ No newline at end of file diff --git a/visualize_logistic_model.ipynb b/visualize_logistic_model.ipynb new file mode 100644 index 0000000..0a4acff --- /dev/null +++ b/visualize_logistic_model.ipynb @@ -0,0 +1,196 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "import pandas as pd\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Define paths\n", + "models_folder = \"Models\" # Path to the folder containing model files\n", + "test_data_path = \"TestDataset/test_data.csv\" # Path to test data\n", + "output_folder = \"FeatureWeights\" # Folder to save output CSVs\n", + "os.makedirs(output_folder, exist_ok=True) # Create output folder if it doesn't exist\n", + "\n", + "# Load feature names from the test data\n", + "feature_names = pd.read_csv(test_data_path, nrows=0).columns.tolist()[2:]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing model: Logistic Regression_pancreatic.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_pancreatic_weights.csv\n", + "Processing model: Logistic Regression_brain.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_brain_weights.csv\n", + "Processing model: Logistic Regression_leukemia.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_leukemia_weights.csv\n", + "Processing model: Logistic Regression_gastric.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_gastric_weights.csv\n", + "Processing model: Logistic Regression_colorectal.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_colorectal_weights.csv\n", + "Processing model: Logistic Regression_lung.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_lung_weights.csv\n", + "Processing model: Logistic Regression_breast.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_breast_weights.csv\n", + "Processing model: Logistic Regression_renal.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_renal_weights.csv\n", + "Processing model: Logistic Regression_liver.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_liver_weights.csv\n", + "Processing model: Logistic Regression_bladder.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_bladder_weights.csv\n", + "Processing model: Logistic Regression_throat.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_throat_weights.csv\n", + "Processing model: Logistic Regression_prostate.joblib\n", + "Saved feature weights to: FeatureWeights/Logistic Regression_prostate_weights.csv\n", + "Processing complete. All feature weights saved.\n" + ] + } + ], + "source": [ + "# Iterate through all joblib files in the Models folder\n", + "for model_file in os.listdir(models_folder):\n", + " if model_file.endswith(\".joblib\"):\n", + " model_path = os.path.join(models_folder, model_file)\n", + " print(f\"Processing model: {model_file}\")\n", + " \n", + " # Load the logistic regression model\n", + " model = joblib.load(model_path)\n", + "\n", + " # Extract coefficients\n", + " coefficients = model.coef_[0] # Assuming binary classification; modify for multi-class\n", + " \n", + " # Pair feature names with coefficients\n", + " feature_weights = pd.DataFrame({\n", + " \"Feature\": feature_names,\n", + " \"Weight\": coefficients\n", + " })\n", + "\n", + " # Sort by weight\n", + " feature_weights = feature_weights.sort_values(by=\"Weight\", ascending=False)\n", + "\n", + " # Save to CSV\n", + " output_file = os.path.join(output_folder, f\"{os.path.splitext(model_file)[0]}_weights.csv\")\n", + " feature_weights.to_csv(output_file, index=False)\n", + "\n", + " print(f\"Saved feature weights to: {output_file}\")\n", + "\n", + "print(\"Processing complete. All feature weights saved.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def combine_feature_weights(feature_weights_folder, output_file):\n", + " combined_data = [] # To store all rows of the final table\n", + "\n", + " # Iterate through all CSV files in the FeatureWeights folder\n", + " for file_name in os.listdir(feature_weights_folder):\n", + " if file_name.endswith(\"_weights.csv\"):\n", + " cancer_type = file_name.split(\"_weights.csv\")[0] # Extract cancer type from the file name\n", + " \n", + " # Load feature weights\n", + " file_path = os.path.join(feature_weights_folder, file_name)\n", + " df = pd.read_csv(file_path)\n", + " \n", + " # Get top 5 positive and negative weights\n", + " top_positive = df.nlargest(5, \"Weight\").reset_index(drop=True)\n", + " top_negative = df.nsmallest(5, \"Weight\").reset_index(drop=True)\n", + " \n", + " # Prepare a row for this cancer type\n", + " row = {\"Cancer Type\": cancer_type}\n", + " for i in range(5): # Add up to 5 features and weights\n", + " row[f\"Positive Feature {i+1}\"] = (\n", + " top_positive.at[i, \"Feature\"] if i < len(top_positive) else \"\"\n", + " )\n", + "\n", + " for i in range(5):\n", + " row[f\"Negative Feature {i+1}\"] = (\n", + " top_negative.at[i, \"Feature\"] if i < len(top_negative) else \"\"\n", + " )\n", + " \n", + " # Add weights in separate columns\n", + " for i in range(5):\n", + " row[f\"Positive Weight {i+1}\"] = (\n", + " top_positive.at[i, \"Weight\"] if i < len(top_positive) else \"\"\n", + " )\n", + " \n", + " for i in range(5):\n", + " row[f\"Negative Weight {i+1}\"] = (\n", + " top_negative.at[i, \"Weight\"] if i < len(top_negative) else \"\"\n", + " )\n", + " combined_data.append(row)\n", + " \n", + " # Create a DataFrame for the combined data\n", + " combined_df = pd.DataFrame(combined_data)\n", + " \n", + " # Save to a CSV file\n", + " combined_df.to_csv(output_file, index=False)\n", + " print(f\"Combined feature weights saved to: {output_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Combined feature weights saved to: CombinedFeatureWeights.csv\n" + ] + } + ], + "source": [ + "# Specify input and output paths\n", + "feature_weights_folder = \"FeatureWeights\" # Folder containing individual feature weights CSVs\n", + "output_file = \"CombinedFeatureWeights.csv\" # Output file for the combined results\n", + "\n", + "# Call the function\n", + "combine_feature_weights(feature_weights_folder, output_file)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a09b8c85ab9a27ffc3f371558a900947d5411116 Mon Sep 17 00:00:00 2001 From: wyatt522 Date: Mon, 2 Dec 2024 18:44:03 -0500 Subject: [PATCH 6/7] reorganized baseline pipeline --- ...pynb => baseline_aggregate_and_infer.ipynb | 7 ++--- ...ipynb => baseline_train_and_evaluate.ipynb | 26 +++++-------------- ...=> baseline_visualize_logistic_model.ipynb | 25 +++++++----------- 3 files changed, 17 insertions(+), 41 deletions(-) rename aggregate_and_infer.ipynb => baseline_aggregate_and_infer.ipynb (96%) rename train_and_evaluate.ipynb => baseline_train_and_evaluate.ipynb (98%) rename visualize_logistic_model.ipynb => baseline_visualize_logistic_model.ipynb (89%) diff --git a/aggregate_and_infer.ipynb b/baseline_aggregate_and_infer.ipynb similarity index 96% rename from aggregate_and_infer.ipynb rename to baseline_aggregate_and_infer.ipynb index dbaf357..04ce609 100644 --- a/aggregate_and_infer.ipynb +++ b/baseline_aggregate_and_infer.ipynb @@ -24,7 +24,6 @@ " models = {}\n", " for file_name in os.listdir(models_folder):\n", " if file_name.endswith(\".joblib\"):\n", - " # Extract model name and cancer type from the filename\n", " model_name, cancer_type = file_name.split(\"_\")\n", " model_path = os.path.join(models_folder, file_name)\n", " models[(model_name, cancer_type)] = joblib.load(model_path)\n", @@ -42,11 +41,9 @@ " \"\"\"Load and preprocess new data.\"\"\"\n", " # Load the data\n", " data = pd.read_csv(new_data_path)\n", - " \n", - " # Drop unnecessary columns (adjust this based on your dataset structure)\n", " X = data.drop(['cancer_type', 'type'], axis=1, errors='ignore')\n", " \n", - " # Standardize the data\n", + " # standardize the data\n", " if scaler is None:\n", " scaler = StandardScaler()\n", " X_scaled = scaler.fit_transform(X)\n", @@ -65,7 +62,7 @@ "def run_inference(models, test_data_path, scalers_folder=\"Scalers\"):\n", " # Load the test data\n", " test_df = pd.read_csv(test_data_path)\n", - " test_features = test_df.drop(['cancer_type', 'type'], axis=1) # Drop label columns if they exist\n", + " test_features = test_df.drop(['cancer_type', 'type'], axis=1)\n", "\n", " # List all scaler files in the Scalers folder\n", " scaler_files = [f for f in os.listdir(scalers_folder) if f.endswith('.joblib')]\n", diff --git a/train_and_evaluate.ipynb b/baseline_train_and_evaluate.ipynb similarity index 98% rename from train_and_evaluate.ipynb rename to baseline_train_and_evaluate.ipynb index 2582c97..d9d3440 100644 --- a/train_and_evaluate.ipynb +++ b/baseline_train_and_evaluate.ipynb @@ -55,27 +55,14 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def pca_data(X, n_components):\n", - " # Apply PCA (assumes X is already standardized)\n", - " pca = PCA(n_components=n_components)\n", - " X_pca = pca.fit_transform(X)\n", - " return X_pca, pca" - ] - }, - { - "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def save_model(model, model_name, cancer_type, folder=\"Models\"):\n", " \"\"\"Save the model to a file using joblib.\"\"\"\n", " if not os.path.exists(folder):\n", - " os.makedirs(folder) # Create directory if it doesn't exist\n", + " os.makedirs(folder)\n", " model_filename = f\"{folder}/{model_name}_{cancer_type}.joblib\"\n", " joblib.dump(model, model_filename)\n", " print(f\"Model {model_name} saved to {model_filename}\")" @@ -83,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -105,12 +92,11 @@ " # Save the best model\n", " save_model(best_model, model_name, cancer_type)\n", "\n", - " # Perform LOOCV\n", + " # perform LOOCV\n", " for train_index, test_index in loo.split(X):\n", " X_train, X_test = X[train_index], X[test_index]\n", " y_train, y_test = y[train_index], y[test_index]\n", "\n", - " # Fit and predict with the best model\n", " best_model.fit(X_train, y_train)\n", " y_pred = best_model.predict(X_test)\n", "\n", @@ -119,7 +105,7 @@ " all_y_true.extend(y_test)\n", " all_y_pred.extend(y_pred)\n", "\n", - " # Compute overall statistics\n", + " # overall stats\n", " avg_score = np.mean(fold_scores)\n", " classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)\n", "\n", @@ -159,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/visualize_logistic_model.ipynb b/baseline_visualize_logistic_model.ipynb similarity index 89% rename from visualize_logistic_model.ipynb rename to baseline_visualize_logistic_model.ipynb index 0a4acff..4c90610 100644 --- a/visualize_logistic_model.ipynb +++ b/baseline_visualize_logistic_model.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -21,15 +21,14 @@ "models_folder = \"Models\" # Path to the folder containing model files\n", "test_data_path = \"TestDataset/test_data.csv\" # Path to test data\n", "output_folder = \"FeatureWeights\" # Folder to save output CSVs\n", - "os.makedirs(output_folder, exist_ok=True) # Create output folder if it doesn't exist\n", + "os.makedirs(output_folder, exist_ok=True)\n", "\n", - "# Load feature names from the test data\n", "feature_names = pd.read_csv(test_data_path, nrows=0).columns.tolist()[2:]" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -71,11 +70,10 @@ " model_path = os.path.join(models_folder, model_file)\n", " print(f\"Processing model: {model_file}\")\n", " \n", - " # Load the logistic regression model\n", " model = joblib.load(model_path)\n", "\n", " # Extract coefficients\n", - " coefficients = model.coef_[0] # Assuming binary classification; modify for multi-class\n", + " coefficients = model.coef_[0]\n", " \n", " # Pair feature names with coefficients\n", " feature_weights = pd.DataFrame({\n", @@ -83,10 +81,8 @@ " \"Weight\": coefficients\n", " })\n", "\n", - " # Sort by weight\n", " feature_weights = feature_weights.sort_values(by=\"Weight\", ascending=False)\n", "\n", - " # Save to CSV\n", " output_file = os.path.join(output_folder, f\"{os.path.splitext(model_file)[0]}_weights.csv\")\n", " feature_weights.to_csv(output_file, index=False)\n", "\n", @@ -107,7 +103,7 @@ " # Iterate through all CSV files in the FeatureWeights folder\n", " for file_name in os.listdir(feature_weights_folder):\n", " if file_name.endswith(\"_weights.csv\"):\n", - " cancer_type = file_name.split(\"_weights.csv\")[0] # Extract cancer type from the file name\n", + " cancer_type = file_name.split(\"_weights.csv\")[0]\n", " \n", " # Load feature weights\n", " file_path = os.path.join(feature_weights_folder, file_name)\n", @@ -141,17 +137,15 @@ " )\n", " combined_data.append(row)\n", " \n", - " # Create a DataFrame for the combined data\n", - " combined_df = pd.DataFrame(combined_data)\n", - " \n", " # Save to a CSV file\n", + " combined_df = pd.DataFrame(combined_data)\n", " combined_df.to_csv(output_file, index=False)\n", " print(f\"Combined feature weights saved to: {output_file}\")" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -164,10 +158,9 @@ ], "source": [ "# Specify input and output paths\n", - "feature_weights_folder = \"FeatureWeights\" # Folder containing individual feature weights CSVs\n", - "output_file = \"CombinedFeatureWeights.csv\" # Output file for the combined results\n", + "feature_weights_folder = \"FeatureWeights\"\n", + "output_file = \"CombinedFeatureWeights.csv\" \n", "\n", - "# Call the function\n", "combine_feature_weights(feature_weights_folder, output_file)" ] } From 9390fb75cf839061038a52ae4a6cf6494f4be9df Mon Sep 17 00:00:00 2001 From: wyatt522 Date: Mon, 2 Dec 2024 18:49:22 -0500 Subject: [PATCH 7/7] reorganized files --- .../baseline_aggregate_and_infer.ipynb | 0 .../baseline_train_and_evaluate.ipynb | 0 .../baseline_visualize_logistic_model.ipynb | 0 clustering.ipynb => Clustering/clustering.ipynb | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename baseline_aggregate_and_infer.ipynb => Baseline/baseline_aggregate_and_infer.ipynb (100%) rename baseline_train_and_evaluate.ipynb => Baseline/baseline_train_and_evaluate.ipynb (100%) rename baseline_visualize_logistic_model.ipynb => Baseline/baseline_visualize_logistic_model.ipynb (100%) rename clustering.ipynb => Clustering/clustering.ipynb (100%) diff --git a/baseline_aggregate_and_infer.ipynb b/Baseline/baseline_aggregate_and_infer.ipynb similarity index 100% rename from baseline_aggregate_and_infer.ipynb rename to Baseline/baseline_aggregate_and_infer.ipynb diff --git a/baseline_train_and_evaluate.ipynb b/Baseline/baseline_train_and_evaluate.ipynb similarity index 100% rename from baseline_train_and_evaluate.ipynb rename to Baseline/baseline_train_and_evaluate.ipynb diff --git a/baseline_visualize_logistic_model.ipynb b/Baseline/baseline_visualize_logistic_model.ipynb similarity index 100% rename from baseline_visualize_logistic_model.ipynb rename to Baseline/baseline_visualize_logistic_model.ipynb diff --git a/clustering.ipynb b/Clustering/clustering.ipynb similarity index 100% rename from clustering.ipynb rename to Clustering/clustering.ipynb