Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
260 changes: 245 additions & 15 deletions your-code/lab_imbalance.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,20 @@
"metadata": {},
"outputs": [],
"source": [
"# Your code here"
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.preprocessing import RobustScaler, StandardScaler, PolynomialFeatures, MinMaxScaler\n",
"import seaborn as sns\n",
"\n",
"import pandas as pd\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score"
]
},
{
Expand All @@ -44,11 +57,153 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Your response here"
"# Work on a 100k-row subsample of the transaction log; the fixed seed makes the\n",
"# subsample (and therefore every downstream metric) identical after Restart & Run All.\n",
"data = pd.read_csv('PS_20174392719_1491204439457_log.csv').sample(100000, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(100000, 11)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"step int64\n",
"type object\n",
"amount float64\n",
"nameOrig object\n",
"oldbalanceOrg float64\n",
"newbalanceOrig float64\n",
"nameDest object\n",
"oldbalanceDest float64\n",
"newbalanceDest float64\n",
"isFraud int64\n",
"isFlaggedFraud int64\n",
"dtype: object"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>step</th>\n",
" <th>type</th>\n",
" <th>amount</th>\n",
" <th>nameOrig</th>\n",
" <th>oldbalanceOrg</th>\n",
" <th>newbalanceOrig</th>\n",
" <th>nameDest</th>\n",
" <th>oldbalanceDest</th>\n",
" <th>newbalanceDest</th>\n",
" <th>isFraud</th>\n",
" <th>isFlaggedFraud</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4465129</th>\n",
" <td>324</td>\n",
" <td>CASH_OUT</td>\n",
" <td>668156.34</td>\n",
" <td>C1251348464</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>C497599109</td>\n",
" <td>17157234.83</td>\n",
" <td>17825391.17</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5870062</th>\n",
" <td>403</td>\n",
" <td>TRANSFER</td>\n",
" <td>2544832.46</td>\n",
" <td>C478732865</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>C1754164558</td>\n",
" <td>7158922.07</td>\n",
" <td>9703754.54</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5956605</th>\n",
" <td>405</td>\n",
" <td>CASH_OUT</td>\n",
" <td>209591.50</td>\n",
" <td>C1733037259</td>\n",
" <td>51908.0</td>\n",
" <td>0.0</td>\n",
" <td>C711944103</td>\n",
" <td>0.00</td>\n",
" <td>209591.50</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" step type amount nameOrig oldbalanceOrg \\\n",
"4465129 324 CASH_OUT 668156.34 C1251348464 0.0 \n",
"5870062 403 TRANSFER 2544832.46 C478732865 0.0 \n",
"5956605 405 CASH_OUT 209591.50 C1733037259 51908.0 \n",
"\n",
" newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n",
"4465129 0.0 C497599109 17157234.83 17825391.17 0 \n",
"5870062 0.0 C1754164558 7158922.07 9703754.54 0 \n",
"5956605 0.0 C711944103 0.00 209591.50 0 \n",
"\n",
" isFlaggedFraud \n",
"4465129 0 \n",
"5870062 0 \n",
"5956605 0 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Quick sanity check of the sample: dimensions, column dtypes, and the first three rows.\n",
"# display() renders each argument as its own rich output, in order.\n",
"display(data.shape, data.dtypes, data.head(3))\n"
]
},
{
Expand All @@ -60,27 +215,77 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Your code here\n"
"# Target: the fraud label of the sampled frame loaded above (`data`).\n",
"y = data['isFraud']\n",
"\n",
"# Features: drop the target and the account identifiers (near-unique strings such as\n",
"# 'C1251348464'), then one-hot encode the transaction type so every column is numeric.\n",
"features = data.drop(columns=['isFraud', 'nameOrig', 'nameDest'])\n",
"X = pd.get_dummies(features, columns=['type'], dtype=int)\n",
"\n",
"# Hold out 20% for testing. Stratify so the rare fraud class is present in both splits,\n",
"# and seed the split so results are reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, stratify=y, random_state=42\n",
")\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run a logisitc regression classifier and evaluate its accuracy."
"### Run a logistic regression classifier and evaluate its accuracy."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 38,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No samples remaining after dropping missing values.\n"
]
}
],
"source": [
"# Your code here"
"import pandas as pd\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Account identifiers are near-unique strings, so they are dropped rather than encoded.\n",
"ID_COLUMNS = ['nameOrig', 'nameDest']\n",
"\n",
"\n",
"def encode_features(frame):\n",
"    \"\"\"Return a numeric copy of `frame`: ID columns dropped, text columns one-hot encoded.\n",
"\n",
"    pd.to_numeric(..., errors='coerce') must not be used for this: the text columns hold\n",
"    labels such as 'CASH_OUT', so every value becomes NaN and a following dropna()\n",
"    removes every row. Frames that are already numeric are returned unchanged.\n",
"    \"\"\"\n",
"    return pd.get_dummies(frame.drop(columns=ID_COLUMNS, errors='ignore'), dtype=int)\n",
"\n",
"\n",
"# Encode both splits the same way. Reindexing gives the test frame exactly the training\n",
"# columns even if a category is missing from one split. Re-running the cell is a no-op\n",
"# because encoded frames pass through encode_features unchanged.\n",
"X_train = encode_features(X_train)\n",
"X_test = encode_features(X_test).reindex(columns=X_train.columns, fill_value=0)\n",
"assert len(X_train) > 0, 'no training rows available'\n",
"assert X_train.notna().all().all(), 'unexpected missing values in the training features'\n",
"\n",
"# Standardize before fitting: amounts and balances are several orders of magnitude larger\n",
"# than the other columns, and unscaled inputs commonly cause solver convergence warnings.\n",
"lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))\n",
"lr.fit(X_train, y_train)\n",
"\n",
"# Score on the existing held-out split instead of re-splitting the training data.\n",
"y_pred = lr.predict(X_test)\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"print(f\"Logistic Regression Test Accuracy: {accuracy * 100:.2f}%\")\n"
]
},
{
Expand All @@ -92,11 +297,36 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 40,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ValueError",
"evalue": "Found array with 0 sample(s) (shape=(0, 10)) while a minimum of 1 is required by DecisionTreeClassifier.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[40], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtree\u001b[39;00m \u001b[39mimport\u001b[39;00m DecisionTreeClassifier\n\u001b[0;32m 2\u001b[0m dtc \u001b[39m=\u001b[39m DecisionTreeClassifier()\n\u001b[1;32m----> 3\u001b[0m dtc\u001b[39m.\u001b[39;49mfit(X_train, y_train)\n\u001b[0;32m 4\u001b[0m y_pred \u001b[39m=\u001b[39m dtc\u001b[39m.\u001b[39mpredict(X_test)\n\u001b[0;32m 5\u001b[0m acc \u001b[39m=\u001b[39m dtc\u001b[39m.\u001b[39mscore(X_test, y_test)\u001b[39m*\u001b[39m\u001b[39m100\u001b[39m\n",
"File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py:889\u001b[0m, in \u001b[0;36mDecisionTreeClassifier.fit\u001b[1;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[0;32m 859\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mfit\u001b[39m(\u001b[39mself\u001b[39m, X, y, sample_weight\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, check_input\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m):\n\u001b[0;32m 860\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Build a decision tree classifier from the training set (X, y).\u001b[39;00m\n\u001b[0;32m 861\u001b[0m \n\u001b[0;32m 862\u001b[0m \u001b[39m Parameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 886\u001b[0m \u001b[39m Fitted estimator.\u001b[39;00m\n\u001b[0;32m 887\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 889\u001b[0m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mfit(\n\u001b[0;32m 890\u001b[0m X,\n\u001b[0;32m 891\u001b[0m y,\n\u001b[0;32m 892\u001b[0m sample_weight\u001b[39m=\u001b[39;49msample_weight,\n\u001b[0;32m 893\u001b[0m check_input\u001b[39m=\u001b[39;49mcheck_input,\n\u001b[0;32m 894\u001b[0m )\n\u001b[0;32m 895\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n",
"File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py:186\u001b[0m, in \u001b[0;36mBaseDecisionTree.fit\u001b[1;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[0;32m 184\u001b[0m check_X_params \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m(dtype\u001b[39m=\u001b[39mDTYPE, accept_sparse\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcsc\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 185\u001b[0m check_y_params \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m(ensure_2d\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, dtype\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m)\n\u001b[1;32m--> 186\u001b[0m X, y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_data(\n\u001b[0;32m 187\u001b[0m X, y, validate_separately\u001b[39m=\u001b[39;49m(check_X_params, check_y_params)\n\u001b[0;32m 188\u001b[0m )\n\u001b[0;32m 189\u001b[0m \u001b[39mif\u001b[39;00m issparse(X):\n\u001b[0;32m 190\u001b[0m X\u001b[39m.\u001b[39msort_indices()\n",
"File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\base.py:579\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[0;32m 577\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mestimator\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m check_X_params:\n\u001b[0;32m 578\u001b[0m check_X_params \u001b[39m=\u001b[39m {\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdefault_check_params, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_X_params}\n\u001b[1;32m--> 579\u001b[0m X \u001b[39m=\u001b[39m check_array(X, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mX\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_X_params)\n\u001b[0;32m 580\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mestimator\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m check_y_params:\n\u001b[0;32m 581\u001b[0m check_y_params \u001b[39m=\u001b[39m {\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdefault_check_params, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_y_params}\n",
"File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:931\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 929\u001b[0m n_samples \u001b[39m=\u001b[39m _num_samples(array)\n\u001b[0;32m 930\u001b[0m \u001b[39mif\u001b[39;00m n_samples \u001b[39m<\u001b[39m ensure_min_samples:\n\u001b[1;32m--> 931\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 932\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFound array with \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m sample(s) (shape=\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m) while a\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 933\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m minimum of \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m is required\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 934\u001b[0m \u001b[39m%\u001b[39m (n_samples, array\u001b[39m.\u001b[39mshape, ensure_min_samples, context)\n\u001b[0;32m 935\u001b[0m )\n\u001b[0;32m 937\u001b[0m \u001b[39mif\u001b[39;00m ensure_min_features \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m \u001b[39mand\u001b[39;00m array\u001b[39m.\u001b[39mndim \u001b[39m==\u001b[39m \u001b[39m2\u001b[39m:\n\u001b[0;32m 938\u001b[0m n_features \u001b[39m=\u001b[39m array\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m]\n",
"\u001b[1;31mValueError\u001b[0m: Found array with 0 sample(s) (shape=(0, 10)) while a minimum of 1 is required by DecisionTreeClassifier."
]
}
],
"source": [
"# Your code here"
"from sklearn.metrics import confusion_matrix, precision_score\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"# Seed the tree so tie-breaking between equally good splits is reproducible.\n",
"dtc = DecisionTreeClassifier(random_state=42)\n",
"dtc.fit(X_train, y_train)\n",
"y_pred = dtc.predict(X_test)\n",
"acc = dtc.score(X_test, y_test)*100\n",
"print(f\"Decision Tree Test Accuracy {round(acc, 2)}%\")\n",
"\n",
"# Pin the label order so the matrix is always 2x2, even if a class is absent from the test split.\n",
"cm = confusion_matrix(y_test, y_pred, labels=[0, 1])\n",
"print(cm)\n",
"\n",
"# Precision = TP / (TP + FP); zero_division=0 reports 0 instead of dividing by zero\n",
"# when the model predicts no positive (fraud) rows at all.\n",
"print('Precision ', precision_score(y_test, y_pred, zero_division=0))"
]
},
{
Expand All @@ -108,11 +338,11 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Your response here"
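"# Unresolved: no model comparison was possible in this run - the logistic-regression cell dropped every row as missing, and the decision tree then failed on 0 samples. I could not work out how to fix it, even with ChatGPT."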
]
},
{
Expand All @@ -139,7 +369,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.10.9"
}
},
"nbformat": 4,
Expand Down