diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb
index a3a5359..93181cb 100644
--- a/your-code/lab_imbalance.ipynb
+++ b/your-code/lab_imbalance.ipynb
@@ -28,11 +28,597 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "\n",
+ "## Importing libraries\n",
+ "\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "import imblearn\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "## DecisionTreeClassifier lives in sklearn.tree, not sklearn.linear_model\n",
+ "from sklearn.tree import DecisionTreeClassifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1000000, 11)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " type | \n",
+ " amount | \n",
+ " nameOrig | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " nameDest | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 3971530 | \n",
+ " 297 | \n",
+ " PAYMENT | \n",
+ " 825.54 | \n",
+ " C1937876142 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " M1022738370 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1371594 | \n",
+ " 138 | \n",
+ " PAYMENT | \n",
+ " 14862.89 | \n",
+ " C64401466 | \n",
+ " 61650.00 | \n",
+ " 46787.11 | \n",
+ " M959468027 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 118535 | \n",
+ " 11 | \n",
+ " CASH_OUT | \n",
+ " 53321.81 | \n",
+ " C929643735 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " C409060521 | \n",
+ " 2042322.34 | \n",
+ " 2665856.06 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5193766 | \n",
+ " 369 | \n",
+ " PAYMENT | \n",
+ " 11216.40 | \n",
+ " C141620937 | \n",
+ " 214929.59 | \n",
+ " 203713.19 | \n",
+ " M424037423 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5140264 | \n",
+ " 356 | \n",
+ " CASH_OUT | \n",
+ " 117.05 | \n",
+ " C218898777 | \n",
+ " 38428.86 | \n",
+ " 38311.81 | \n",
+ " C160840537 | \n",
+ " 1490943.54 | \n",
+ " 1491060.59 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step type amount nameOrig oldbalanceOrg newbalanceOrig \\\n",
+ "3971530 297 PAYMENT 825.54 C1937876142 0.00 0.00 \n",
+ "1371594 138 PAYMENT 14862.89 C64401466 61650.00 46787.11 \n",
+ "118535 11 CASH_OUT 53321.81 C929643735 0.00 0.00 \n",
+ "5193766 369 PAYMENT 11216.40 C141620937 214929.59 203713.19 \n",
+ "5140264 356 CASH_OUT 117.05 C218898777 38428.86 38311.81 \n",
+ "\n",
+ " nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n",
+ "3971530 M1022738370 0.00 0.00 0 0 \n",
+ "1371594 M959468027 0.00 0.00 0 0 \n",
+ "118535 C409060521 2042322.34 2665856.06 0 0 \n",
+ "5193766 M424037423 0.00 0.00 0 0 \n",
+ "5140264 C160840537 1490943.54 1491060.59 0 0 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## NOTE(review): absolute, machine-specific path; point this at your local copy of the CSV.\n",
+ "## random_state pins the 1,000,000-row sample so a Restart & Run All draws the same rows every time.\n",
+ "data = pd.read_csv(\"/Users/caionunez/Downloads/PS_20174392719_1491204439457_log.csv\").sample(1000000, random_state=0)\n",
+ "print(data.shape)\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 1000000.000000 | \n",
+ " 1.000000e+06 | \n",
+ " 1.000000e+06 | \n",
+ " 1.000000e+06 | \n",
+ " 1.000000e+06 | \n",
+ " 1.000000e+06 | \n",
+ " 1000000.000000 | \n",
+ " 1000000.000000 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 243.405510 | \n",
+ " 1.806395e+05 | \n",
+ " 8.328807e+05 | \n",
+ " 8.543091e+05 | \n",
+ " 1.099738e+06 | \n",
+ " 1.224374e+06 | \n",
+ " 0.001340 | \n",
+ " 0.000001 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 142.476961 | \n",
+ " 6.185435e+05 | \n",
+ " 2.883078e+06 | \n",
+ " 2.918763e+06 | \n",
+ " 3.388608e+06 | \n",
+ " 3.676593e+06 | \n",
+ " 0.036581 | \n",
+ " 0.001000 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 1.000000 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 155.000000 | \n",
+ " 1.341511e+04 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 239.000000 | \n",
+ " 7.503728e+04 | \n",
+ " 1.424776e+04 | \n",
+ " 0.000000e+00 | \n",
+ " 1.329255e+05 | \n",
+ " 2.138667e+05 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 335.000000 | \n",
+ " 2.089625e+05 | \n",
+ " 1.072873e+05 | \n",
+ " 1.451902e+05 | \n",
+ " 9.445696e+05 | \n",
+ " 1.112732e+06 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 743.000000 | \n",
+ " 6.384799e+07 | \n",
+ " 5.958504e+07 | \n",
+ " 4.958504e+07 | \n",
+ " 3.249151e+08 | \n",
+ " 3.555534e+08 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step amount oldbalanceOrg newbalanceOrig \\\n",
+ "count 1000000.000000 1.000000e+06 1.000000e+06 1.000000e+06 \n",
+ "mean 243.405510 1.806395e+05 8.328807e+05 8.543091e+05 \n",
+ "std 142.476961 6.185435e+05 2.883078e+06 2.918763e+06 \n",
+ "min 1.000000 0.000000e+00 0.000000e+00 0.000000e+00 \n",
+ "25% 155.000000 1.341511e+04 0.000000e+00 0.000000e+00 \n",
+ "50% 239.000000 7.503728e+04 1.424776e+04 0.000000e+00 \n",
+ "75% 335.000000 2.089625e+05 1.072873e+05 1.451902e+05 \n",
+ "max 743.000000 6.384799e+07 5.958504e+07 4.958504e+07 \n",
+ "\n",
+ " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n",
+ "count 1.000000e+06 1.000000e+06 1000000.000000 1000000.000000 \n",
+ "mean 1.099738e+06 1.224374e+06 0.001340 0.000001 \n",
+ "std 3.388608e+06 3.676593e+06 0.036581 0.001000 \n",
+ "min 0.000000e+00 0.000000e+00 0.000000 0.000000 \n",
+ "25% 0.000000e+00 0.000000e+00 0.000000 0.000000 \n",
+ "50% 1.329255e+05 2.138667e+05 0.000000 0.000000 \n",
+ "75% 9.445696e+05 1.112732e+06 0.000000 0.000000 \n",
+ "max 3.249151e+08 3.555534e+08 1.000000 1.000000 "
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "step int64\n",
+ "type object\n",
+ "amount float64\n",
+ "nameOrig object\n",
+ "oldbalanceOrg float64\n",
+ "newbalanceOrig float64\n",
+ "nameDest object\n",
+ "oldbalanceDest float64\n",
+ "newbalanceDest float64\n",
+ "isFraud int64\n",
+ "isFlaggedFraud int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## I think the outcome mentioned is the one represented in the column \"isFraud\"\n",
+ "\n",
+ "data.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 998660\n",
+ "1 1340\n",
+ "Name: isFraud, dtype: int64"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[\"isFraud\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 999999\n",
+ "1 1\n",
+ "Name: isFlaggedFraud, dtype: int64"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[\"isFlaggedFraud\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CASH_OUT 350876\n",
+ "PAYMENT 337961\n",
+ "CASH_IN 220737\n",
+ "TRANSFER 83854\n",
+ "DEBIT 6572\n",
+ "Name: type, dtype: int64"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[\"type\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | step | \n",
+ " 1.000000 | \n",
+ " 0.022152 | \n",
+ " -0.010426 | \n",
+ " -0.010768 | \n",
+ " 0.026199 | \n",
+ " 0.024653 | \n",
+ " 0.032884 | \n",
+ " 0.002180 | \n",
+ "
\n",
+ " \n",
+ " | amount | \n",
+ " 0.022152 | \n",
+ " 1.000000 | \n",
+ " -0.002622 | \n",
+ " -0.007568 | \n",
+ " 0.305381 | \n",
+ " 0.472149 | \n",
+ " 0.071674 | \n",
+ " 0.005490 | \n",
+ "
\n",
+ " \n",
+ " | oldbalanceOrg | \n",
+ " -0.010426 | \n",
+ " -0.002622 | \n",
+ " 1.000000 | \n",
+ " 0.998807 | \n",
+ " 0.067611 | \n",
+ " 0.043007 | \n",
+ " 0.009904 | \n",
+ " 0.000952 | \n",
+ "
\n",
+ " \n",
+ " | newbalanceOrig | \n",
+ " -0.010768 | \n",
+ " -0.007568 | \n",
+ " 0.998807 | \n",
+ " 1.000000 | \n",
+ " 0.069179 | \n",
+ " 0.042784 | \n",
+ " -0.007904 | \n",
+ " 0.000933 | \n",
+ "
\n",
+ " \n",
+ " | oldbalanceDest | \n",
+ " 0.026199 | \n",
+ " 0.305381 | \n",
+ " 0.067611 | \n",
+ " 0.069179 | \n",
+ " 1.000000 | \n",
+ " 0.976190 | \n",
+ " -0.006897 | \n",
+ " -0.000325 | \n",
+ "
\n",
+ " \n",
+ " | newbalanceDest | \n",
+ " 0.024653 | \n",
+ " 0.472149 | \n",
+ " 0.043007 | \n",
+ " 0.042784 | \n",
+ " 0.976190 | \n",
+ " 1.000000 | \n",
+ " -0.000392 | \n",
+ " -0.000333 | \n",
+ "
\n",
+ " \n",
+ " | isFraud | \n",
+ " 0.032884 | \n",
+ " 0.071674 | \n",
+ " 0.009904 | \n",
+ " -0.007904 | \n",
+ " -0.006897 | \n",
+ " -0.000392 | \n",
+ " 1.000000 | \n",
+ " 0.027300 | \n",
+ "
\n",
+ " \n",
+ " | isFlaggedFraud | \n",
+ " 0.002180 | \n",
+ " 0.005490 | \n",
+ " 0.000952 | \n",
+ " 0.000933 | \n",
+ " -0.000325 | \n",
+ " -0.000333 | \n",
+ " 0.027300 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step amount oldbalanceOrg newbalanceOrig \\\n",
+ "step 1.000000 0.022152 -0.010426 -0.010768 \n",
+ "amount 0.022152 1.000000 -0.002622 -0.007568 \n",
+ "oldbalanceOrg -0.010426 -0.002622 1.000000 0.998807 \n",
+ "newbalanceOrig -0.010768 -0.007568 0.998807 1.000000 \n",
+ "oldbalanceDest 0.026199 0.305381 0.067611 0.069179 \n",
+ "newbalanceDest 0.024653 0.472149 0.043007 0.042784 \n",
+ "isFraud 0.032884 0.071674 0.009904 -0.007904 \n",
+ "isFlaggedFraud 0.002180 0.005490 0.000952 0.000933 \n",
+ "\n",
+ " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n",
+ "step 0.026199 0.024653 0.032884 0.002180 \n",
+ "amount 0.305381 0.472149 0.071674 0.005490 \n",
+ "oldbalanceOrg 0.067611 0.043007 0.009904 0.000952 \n",
+ "newbalanceOrig 0.069179 0.042784 -0.007904 0.000933 \n",
+ "oldbalanceDest 1.000000 0.976190 -0.006897 -0.000325 \n",
+ "newbalanceDest 0.976190 1.000000 -0.000392 -0.000333 \n",
+ "isFraud -0.006897 -0.000392 1.000000 0.027300 \n",
+ "isFlaggedFraud -0.000325 -0.000333 0.027300 1.000000 "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## Correlate numeric columns only: the frame still holds string columns (type, nameOrig, nameDest),\n",
+ "## and newer pandas versions raise instead of silently skipping them.\n",
+ "data.select_dtypes(include=\"number\").corr()\n",
+ "\n",
+ "## Most pairs are weakly correlated, but each old/new balance pair is almost collinear\n",
+ "## (oldbalanceOrg vs newbalanceOrig and oldbalanceDest vs newbalanceDest are both above 0.97)."
]
},
{
@@ -44,11 +630,52 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 998660\n",
+ "1 1340\n",
+ "Name: isFraud, dtype: int64"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Your response here\n",
+ "\n",
+ "## Bringing up the value counts for the column again. Given this result, and considering this is the\n",
+ "## imbalance lab, we should balance the classes before proceeding.\n",
+ "\n",
+ "data[\"isFraud\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGnCAYAAABl41fiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAan0lEQVR4nO3db2yd5X3/8Y9JiMNKbUSymqQ1IbCBooUycFqWhGijG24DQorERia0hjKQapU1S9xUI81UlgjN7QQ060oCFUmjSBmLWP+IrV4XS9NKIExq3KT7Q7Z2hOIAdi0HzQ5p5ZD47AG/ePLPDuSYkAs7r5d0HpzL133O90gNfve+zzmuqVQqlQAAFHJe6QEAgHObGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIqaUDHy9NNP59Zbb83s2bNTU1OT73znO1U/RqVSyYMPPpgrr7wytbW1aWxszJ//+Z+f+WEBgNMytfQA1Th69Giuueaa3HXXXbntttvG9Rh//Md/nF27duXBBx/M1Vdfnf7+/vT19Z3hSQGA01UzUf9QXk1NTb797W9n2bJlw2vHjh3Ln/7pn2bHjh35n//5n8yfPz9f/vKX81u/9VtJkgMHDuTDH/5w/v3f/z1XXXVVmcEBgBEm1GWat3PXXXfl2Wefzd/8zd/kX//1X/N7v/d7+cQnPpGf/OQnSZK/+7u/y+WXX56///u/z9y5c3PZZZflnnvuyWuvvVZ4cgA4d02aGHnhhRfyxBNP5Mknn8ySJUtyxRVXZM2aNbnhhhvyjW98I0ly8ODBvPTSS3nyySezffv2bNu2LZ2dnfnd3/3dwtMDwLlrQr1n5K388Ic/TKVSyZVXXjlifXBwMDNmzEiSDA0NZXBwMNu3bx/et2XLljQ1NeW//uu/XLoBgAImTYwMDQ1lypQp6ezszJQpU0b87MILL0ySzJo1K1OnTh0RLPPmzUuSdHV1iREAKGDSxMi1116bEydOpLe3N0uWLBlzz+LFi3P8+PG88MILueKKK5IkP/7xj5Mkc+bMOWuzAgD/Z0J9mub111/Pf//3fyd5Mz4efvjh3Hjjjbn44otz6aWX5g/+4A/y7LPP5qGHHsq1116bvr6+/NM//VOuvvrq3HzzzRkaGspHPvKRXHjhhdm4cWOGhoZy7733pq6uLrt27Sr86gDg3DShYuSf//mfc+ONN45av/POO7Nt27a88cYbeeCBB7J9+/a88sormTFjRhYuXJj169fn6quvTpK8+uqr+exnP5tdu3blfe97X5YuXZqHHnooF1988dl+OQBAJliMAACTz6T5aC8AMDGJEQCgqAnxaZqhoaG8+uqref/735+amprS4wAAp6FSqeTIkSOZPXt2zjvv1Oc/JkSMvPrqq2lsbCw9BgAwDocOHcqHPvShU/58QsTI+9///iRvvpi6urrC0wAAp2NgYCCNjY3Dv8dPZULEyMlLM3V1dWIEACaYt3uLhTewAgBFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKCoqmPk6aefzq233prZs2enpqYm3/nOd972mO9///tpamrK9OnTc/nll+fRRx8dz6wAwCRUdYwcPXo011xzTb72ta+d1v4XX3wxN998c5YsWZJ9+/blC1/4QlauXJlvfvObVQ8LAEw+Vf+hvKVLl2bp0qWnvf/RRx/NpZdemo0bNyZJ5s2bl7179+bBBx/MbbfdVu3TAwCTzLv+npHnnnsuzc3NI9Y+/vGPZ+/evXnjjTfGPGZwcDADAwMjbgDA5FT1mZFq9fT0pKGhYcRaQ0NDjh8/nr6+vsyaNWvUMW1tbVm/fv27PdqEcNl93y09AmfRT790S+kRAM66s/JpmpqamhH3K5XKmOsnrV27Nv39/cO3Q4cOveszAgBlvOtnRi655JL09PSMWOvt7c3UqVMzY8aMMY+pra1NbW3t
uz0aAPAe8K6fGVm4cGE6OjpGrO3atSsLFizI+eef/24/PQDwHld1jLz++uvZv39/9u/fn+TNj+7u378/XV1dSd68xLJixYrh/S0tLXnppZfS2tqaAwcOZOvWrdmyZUvWrFlzZl4BADChVX2ZZu/evbnxxhuH77e2tiZJ7rzzzmzbti3d3d3DYZIkc+fOTXt7e1avXp1HHnkks2fPzle/+lUf6wUAkiQ1lZPvJn0PGxgYSH19ffr7+1NXV1d6nLPKp2nOLT5NA0wmp/v729+mAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKGpcMbJp06bMnTs306dPT1NTU3bv3v2W+3fs2JFrrrkmv/RLv5RZs2blrrvuyuHDh8c1MAAwuVQdIzt37syqVauybt267Nu3L0uWLMnSpUvT1dU15v5nnnkmK1asyN13353/+I//yJNPPpkf/OAHueeee97x8ADAxFd1jDz88MO5++67c88992TevHnZuHFjGhsbs3nz5jH3/8u//Esuu+yyrFy5MnPnzs0NN9yQT3/609m7d+87Hh4AmPiqipFjx46ls7Mzzc3NI9abm5uzZ8+eMY9ZtGhRXn755bS3t6dSqeRnP/tZ/vZv/za33HLLKZ9ncHAwAwMDI24AwORUVYz09fXlxIkTaWhoGLHe0NCQnp6eMY9ZtGhRduzYkeXLl2fatGm55JJLctFFF+Wv/uqvTvk8bW1tqa+vH741NjZWMyYAMIGM6w2sNTU1I+5XKpVRayc9//zzWblyZb74xS+ms7Mz3/ve9/Liiy+mpaXllI+/du3a9Pf3D98OHTo0njEBgAlgajWbZ86cmSlTpow6C9Lb2zvqbMlJbW1tWbx4cT7/+c8nST784Q/nfe97X5YsWZIHHnggs2bNGnVMbW1tamtrqxkNAJigqjozMm3atDQ1NaWjo2PEekdHRxYtWjTmMT//+c9z3nkjn2bKlClJ3jyjAgCc26q+TNPa2prHH388W7duzYEDB7J69ep0dXUNX3ZZu3ZtVqxYMbz/1ltvzbe+9a1s3rw5Bw8ezLPPPpuVK1fmox/9aGbPnn3mXgkAMCFVdZkmSZYvX57Dhw9nw4YN6e7uzvz589Pe3p45c+YkSbq7u0d858inPvWpHDlyJF/72tfyuc99LhdddFE+9rGP5ctf/vKZexUAwIRVU5kA10oGBgZSX1+f/v7+1NXVlR7nrLrsvu+WHoGz6KdfOvVH3gEmmtP9/e1v0wAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosYVI5s2bcrcuXMzffr0NDU1Zffu3W+5f3BwMOvWrcucOXNSW1ubK664Ilu3
bh3XwADA5DK12gN27tyZVatWZdOmTVm8eHEee+yxLF26NM8//3wuvfTSMY+5/fbb87Of/SxbtmzJr/zKr6S3tzfHjx9/x8MDABNfTaVSqVRzwPXXX5/rrrsumzdvHl6bN29eli1blra2tlH7v/e97+X3f//3c/DgwVx88cXjGnJgYCD19fXp7+9PXV3duB5jorrsvu+WHoGz6KdfuqX0CABnzOn+/q7qMs2xY8fS2dmZ5ubmEevNzc3Zs2fPmMc89dRTWbBgQf7iL/4iH/zgB3PllVdmzZo1+cUvfnHK5xkcHMzAwMCIGwAwOVV1maavry8nTpxIQ0PDiPWGhob09PSMeczBgwfzzDPPZPr06fn2t7+dvr6+fOYzn8lrr712yveNtLW1Zf369dWMBgBMUON6A2tNTc2I+5VKZdTaSUNDQ6mpqcmOHTvy0Y9+NDfffHMefvjhbNu27ZRnR9auXZv+/v7h26FDh8YzJgAwAVR1ZmTmzJmZMmXKqLMgvb29o86WnDRr1qx88IMfTH19/fDavHnzUqlU8vLLL+dXf/VXRx1TW1ub2traakYDACaoqs6MTJs2LU1NTeno6Bix3tHRkUWLFo15zOLFi/Pqq6/m9ddfH1778Y9/nPPOOy8f+tCHxjEyADCZVH2ZprW1NY8//ni2bt2aAwcOZPXq1enq6kpLS0uSNy+xrFixYnj/HXfckRkzZuSuu+7K888/n6effjqf//zn84d/+Ie54IILztwrAQAmpKq/Z2T58uU5fPhwNmzYkO7u7syfPz/t7e2ZM2dOkqS7uztdXV3D+y+88MJ0dHTks5/9bBYsWJAZM2bk9ttvzwMPPHDmXgUAMGFV/T0jJfieEc4VvmcEmEzele8ZAQA408QIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoaV4xs2rQpc+fOzfTp09PU1JTdu3ef1nHPPvtspk6dml//9V8fz9MCAJNQ1TGyc+fOrFq1KuvWrcu+ffuyZMmSLF26NF1dXW95XH9/f1asWJHf/u3fHvewAMDkU3WMPPzww7n77rtzzz33ZN68edm4cWMaGxuzefPmtzzu05/+dO64444sXLhw3MMCAJNPVTFy7NixdHZ2prm5ecR6c3Nz9uzZc8rjvvGNb+SFF17I/ffff1rPMzg4mIGBgRE3AGByqipG+vr6cuLEiTQ0NIxYb2hoSE9Pz5jH/OQnP8l9992XHTt2ZOrUqaf1PG1tbamvrx++NTY2VjMmADCBjOsNrDU1NSPuVyqVUWtJcuLEidxxxx1Zv359rrzyytN+/LVr16a/v3/4dujQofGMCQBMAKd3quL/mTlzZqZMmTLqLEhvb++osyVJcuTIkezduzf79u3LH/3RHyVJhoaGUqlUMnXq1OzatSsf+9jHRh1XW1ub2traakYDACaoqs6MTJs2LU1NTeno6Bix3tHRkUWLFo3aX1dXl3/7t3/L/v37h28tLS256qqrsn///lx//fXvbHoAYMKr6sxIkrS2tuaTn/xkFixYkIULF+brX/96urq60tLSkuTNSyyvvPJKtm/fnvPOOy/z588fcfwHPvCBTJ8+fdQ6AHBuqjpGli9fnsOHD2fDhg3p7u7O/Pnz097enjlz5iRJuru73/Y7RwAATqqpVCqV0kO8nYGBgdTX16e/vz91dXWl
xzmrLrvvu6VH4Cz66ZduKT0CwBlzur+//W0aAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFDUuGJk06ZNmTt3bqZPn56mpqbs3r37lHu/9a1v5aabbsov//Ivp66uLgsXLsw//uM/jntgAGByqTpGdu7cmVWrVmXdunXZt29flixZkqVLl6arq2vM/U8//XRuuummtLe3p7OzMzfeeGNuvfXW7Nu37x0PDwBMfDWVSqVSzQHXX399rrvuumzevHl4bd68eVm2bFna2tpO6zF+7dd+LcuXL88Xv/jF09o/MDCQ+vr69Pf3p66urppxJ7zL7vtu6RE4i376pVtKjwBwxpzu7++qzowcO3YsnZ2daW5uHrHe3NycPXv2nNZjDA0N5ciRI7n44otPuWdwcDADAwMjbgDA5FRVjPT19eXEiRNpaGgYsd7Q0JCenp7TeoyHHnooR48eze23337KPW1tbamvrx++NTY2VjMmADCBjOsNrDU1NSPuVyqVUWtjeeKJJ/Jnf/Zn2blzZz7wgQ+cct/atWvT398/fDt06NB4xgQAJoCp1WyeOXNmpkyZMuosSG9v76izJf+/nTt35u67786TTz6Z3/md33nLvbW1tamtra1mNABggqrqzMi0adPS1NSUjo6OEesdHR1ZtGjRKY974okn8qlPfSp//dd/nVtu8QY9AOD/VHVmJElaW1vzyU9+MgsWLMjChQvz9a9/PV1dXWlpaUny5iWWV155Jdu3b0/yZoisWLEif/mXf5nf+I3fGD6rcsEFF6S+vv4MvhQAYCKqOkaWL1+ew4cPZ8OGDenu7s78+fPT3t6eOXPmJEm6u7tHfOfIY489luPHj+fee+/NvffeO7x+5513Ztu2be/8FQAAE1rV3zNSgu8Z4Vzhe0aAyeRd+Z4RAIAzTYwAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoKhxxcimTZsyd+7cTJ8+PU1NTdm9e/db7v/+97+fpqamTJ8+PZdffnkeffTRcQ0LAEw+VcfIzp07s2rVqqxbty779u3LkiVLsnTp0nR1dY25/8UXX8zNN9+cJUuWZN++ffnCF76QlStX5pvf/OY7Hh4AmPhqKpVKpZoDrr/++lx33XXZvHnz8Nq8efOybNmytLW1jdr/J3/yJ3nqqady4MCB4bWWlpb86Ec/ynPPPXdazzkwMJD6+vr09/enrq6umnEnvMvu+27pETiLfvqlW0qPAHDGnO7v76nVPOixY8fS2dmZ++67b8R6c3Nz9uzZM+Yxzz33XJqbm0esffzjH8+WLVvyxhtv5Pzzzx91zODgYAYHB4fv9/f3J3nzRZ1rhgZ/XnoEzqJz
8X/jwOR18r9pb3feo6oY6evry4kTJ9LQ0DBivaGhIT09PWMe09PTM+b+48ePp6+vL7NmzRp1TFtbW9avXz9qvbGxsZpxYcKp31h6AoAz78iRI6mvrz/lz6uKkZNqampG3K9UKqPW3m7/WOsnrV27Nq2trcP3h4aG8tprr2XGjBlv+TxMDgMDA2lsbMyhQ4fOuctyMNn5931uqVQqOXLkSGbPnv2W+6qKkZkzZ2bKlCmjzoL09vaOOvtx0iWXXDLm/qlTp2bGjBljHlNbW5va2toRaxdddFE1ozIJ1NXV+Y8VTFL+fZ873uqMyElVfZpm2rRpaWpqSkdHx4j1jo6OLFq0aMxjFi5cOGr/rl27smDBgjHfLwIAnFuq/mhva2trHn/88WzdujUHDhzI6tWr09XVlZaWliRvXmJZsWLF8P6Wlpa89NJLaW1tzYEDB7J169Zs2bIla9asOXOvAgCYsKp+z8jy5ctz+PDhbNiwId3d3Zk/f37a29szZ86cJEl3d/eI7xyZO3du2tvbs3r16jzyyCOZPXt2vvrVr+a22247c6+CSaW2tjb333//qEt1wMTn3zdjqfp7RgAAziR/mwYAKEqMAABFiREAoCgxAgAUJUYAgKLG9XXwcCa9/PLL2bx5c/bs2ZOenp7U1NSkoaEhixYtSktLi79JBDDJ+WgvRT3zzDNZunRpGhsb09zcnIaGhlQqlfT29qajoyOHDh3KP/zDP2Tx4sWlRwXOsEOHDuX+++/P1q1bS49CYWKEoj7ykY/khhtuyFe+8pUxf7569eo888wz+cEPfnCWJwPebT/60Y9y3XXX5cSJE6VHoTAxQlEXXHBB9u/fn6uuumrMn//nf/5nrr322vziF784y5MB79RTTz31lj8/ePBgPve5z4kRvGeEsmbNmpU9e/acMkaee+65zJo16yxPBZwJy5YtS01NTd7q//PW1NScxYl4rxIjFLVmzZq0tLSks7MzN910UxoaGlJTU5Oenp50dHTk8ccfz8aNG0uPCYzDrFmz8sgjj2TZsmVj/nz//v1pamo6u0PxniRGKOozn/lMZsyYka985St57LHHhk/XTpkyJU1NTdm+fXtuv/32wlMC49HU1JQf/vCHp4yRtztrwrnDe0Z4z3jjjTfS19eXJJk5c2bOP//8whMB78Tu3btz9OjRfOITnxjz50ePHs3evXvzm7/5m2d5Mt5rxAgAUJRvYAUAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQ1P8ChMnWVWrGPWIAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
- "# Your response here"
+ "data[\"isFraud\"].value_counts().plot(kind=\"bar\")\n",
+ "plt.show()\n",
+ "\n",
+ "## Quite unbalanced"
]
},
{
@@ -60,11 +687,738 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 28,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " type | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 3971530 | \n",
+ " 297 | \n",
+ " PAYMENT | \n",
+ " 825.54 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1371594 | \n",
+ " 138 | \n",
+ " PAYMENT | \n",
+ " 14862.89 | \n",
+ " 61650.00 | \n",
+ " 46787.11 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 118535 | \n",
+ " 11 | \n",
+ " CASH_OUT | \n",
+ " 53321.81 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 2042322.34 | \n",
+ " 2665856.06 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5193766 | \n",
+ " 369 | \n",
+ " PAYMENT | \n",
+ " 11216.40 | \n",
+ " 214929.59 | \n",
+ " 203713.19 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5140264 | \n",
+ " 356 | \n",
+ " CASH_OUT | \n",
+ " 117.05 | \n",
+ " 38428.86 | \n",
+ " 38311.81 | \n",
+ " 1490943.54 | \n",
+ " 1491060.59 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step type amount oldbalanceOrg newbalanceOrig \\\n",
+ "3971530 297 PAYMENT 825.54 0.00 0.00 \n",
+ "1371594 138 PAYMENT 14862.89 61650.00 46787.11 \n",
+ "118535 11 CASH_OUT 53321.81 0.00 0.00 \n",
+ "5193766 369 PAYMENT 11216.40 214929.59 203713.19 \n",
+ "5140264 356 CASH_OUT 117.05 38428.86 38311.81 \n",
+ "\n",
+ " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n",
+ "3971530 0.00 0.00 0 0 \n",
+ "1371594 0.00 0.00 0 0 \n",
+ "118535 2042322.34 2665856.06 0 0 \n",
+ "5193766 0.00 0.00 0 0 \n",
+ "5140264 1490943.54 1491060.59 0 0 "
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Your code here\n",
+ "\n",
+ "## I don't understand the question about the time variable, but I'm removing the name columns as they do not appear to add any value. The other\n",
+ "## string column, \"type\", will not be dropped as it seems relevant to the understanding of the data. I'll create\n",
+ "## dummies for it.\n",
+ "\n",
+ "## errors=\"ignore\" keeps this cell safe to re-run once the columns are already gone.\n",
+ "data = data.drop(columns=[\"nameOrig\", \"nameDest\"], errors=\"ignore\")\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " type | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ " type_CASH_IN | \n",
+ " type_CASH_OUT | \n",
+ " type_DEBIT | \n",
+ " type_PAYMENT | \n",
+ " type_TRANSFER | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 3971530 | \n",
+ " 297 | \n",
+ " PAYMENT | \n",
+ " 825.54 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1371594 | \n",
+ " 138 | \n",
+ " PAYMENT | \n",
+ " 14862.89 | \n",
+ " 61650.00 | \n",
+ " 46787.11 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 118535 | \n",
+ " 11 | \n",
+ " CASH_OUT | \n",
+ " 53321.81 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 2042322.34 | \n",
+ " 2665856.06 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5193766 | \n",
+ " 369 | \n",
+ " PAYMENT | \n",
+ " 11216.40 | \n",
+ " 214929.59 | \n",
+ " 203713.19 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5140264 | \n",
+ " 356 | \n",
+ " CASH_OUT | \n",
+ " 117.05 | \n",
+ " 38428.86 | \n",
+ " 38311.81 | \n",
+ " 1490943.54 | \n",
+ " 1491060.59 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step type amount oldbalanceOrg newbalanceOrig \\\n",
+ "3971530 297 PAYMENT 825.54 0.00 0.00 \n",
+ "1371594 138 PAYMENT 14862.89 61650.00 46787.11 \n",
+ "118535 11 CASH_OUT 53321.81 0.00 0.00 \n",
+ "5193766 369 PAYMENT 11216.40 214929.59 203713.19 \n",
+ "5140264 356 CASH_OUT 117.05 38428.86 38311.81 \n",
+ "\n",
+ " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \\\n",
+ "3971530 0.00 0.00 0 0 \n",
+ "1371594 0.00 0.00 0 0 \n",
+ "118535 2042322.34 2665856.06 0 0 \n",
+ "5193766 0.00 0.00 0 0 \n",
+ "5140264 1490943.54 1491060.59 0 0 \n",
+ "\n",
+ " type_CASH_IN type_CASH_OUT type_DEBIT type_PAYMENT type_TRANSFER \n",
+ "3971530 0 0 0 1 0 \n",
+ "1371594 0 0 0 1 0 \n",
+ "118535 0 1 0 0 0 \n",
+ "5193766 0 0 0 1 0 \n",
+ "5140264 0 1 0 0 0 "
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here\n"
+ "## One-hot encode \"type\" into type_* indicator columns and append them to the frame.\n",
+ "## The original \"type\" column is still present after this cell.\n",
+ "dummies = pd.get_dummies(data['type'], prefix = \"type\")\n",
+ "data = pd.concat([data, dummies], axis=1)\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ " type_CASH_IN | \n",
+ " type_CASH_OUT | \n",
+ " type_DEBIT | \n",
+ " type_PAYMENT | \n",
+ " type_TRANSFER | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 3971530 | \n",
+ " 297 | \n",
+ " 825.54 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1371594 | \n",
+ " 138 | \n",
+ " 14862.89 | \n",
+ " 61650.00 | \n",
+ " 46787.11 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 118535 | \n",
+ " 11 | \n",
+ " 53321.81 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 2042322.34 | \n",
+ " 2665856.06 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5193766 | \n",
+ " 369 | \n",
+ " 11216.40 | \n",
+ " 214929.59 | \n",
+ " 203713.19 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5140264 | \n",
+ " 356 | \n",
+ " 117.05 | \n",
+ " 38428.86 | \n",
+ " 38311.81 | \n",
+ " 1490943.54 | \n",
+ " 1491060.59 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step amount oldbalanceOrg newbalanceOrig oldbalanceDest \\\n",
+ "3971530 297 825.54 0.00 0.00 0.00 \n",
+ "1371594 138 14862.89 61650.00 46787.11 0.00 \n",
+ "118535 11 53321.81 0.00 0.00 2042322.34 \n",
+ "5193766 369 11216.40 214929.59 203713.19 0.00 \n",
+ "5140264 356 117.05 38428.86 38311.81 1490943.54 \n",
+ "\n",
+ " newbalanceDest isFraud isFlaggedFraud type_CASH_IN type_CASH_OUT \\\n",
+ "3971530 0.00 0 0 0 0 \n",
+ "1371594 0.00 0 0 0 0 \n",
+ "118535 2665856.06 0 0 0 1 \n",
+ "5193766 0.00 0 0 0 0 \n",
+ "5140264 1491060.59 0 0 0 1 \n",
+ "\n",
+ " type_DEBIT type_PAYMENT type_TRANSFER \n",
+ "3971530 0 1 0 \n",
+ "1371594 0 1 0 \n",
+ "118535 0 0 0 \n",
+ "5193766 0 1 0 \n",
+ "5140264 0 0 0 "
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## Drop the original string column now that it is one-hot encoded: the models below need\n",
+ "## numeric features only. errors=\"ignore\" keeps the cell safe to re-run.\n",
+ "data = data.drop(columns=[\"type\"], errors=\"ignore\")\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train accuracy score: 0.9981973333333334\n",
+ "Test accuracy score: 0.99824\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Baseline: fit a model on the data as it is, then do the balancing and run the other models\n",
+ "## requested in the cells below.\n",
+ "\n",
+ "y = data[\"isFraud\"]\n",
+ "X = data.drop(columns=\"isFraud\")\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
+ "\n",
+ "log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)\n",
+ "print(\"Train accuracy score: \", log_reg.score(X_train, y_train))\n",
+ "print(\"Test accuracy score: \", log_reg.score(X_test, y_test))\n",
+ "\n",
+ "## These accuracies are probably not as good as they seem, because the data is so unbalanced."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[249419, 257],\n",
+ " [ 183, 141]])"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import confusion_matrix\n",
+ "\n",
+ "pred = log_reg.predict(X_test)\n",
+ "confusion_matrix(y_test, pred)\n",
+ "\n",
+ "## A lot of false negatives relative to the actual number of positives (fraud cases), will proceed with the balancing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(748984, 13)\n",
+ "(1016, 13)\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Oversampling of the data\n",
+ "\n",
+ "train = pd.concat([X_train,y_train], axis = 1)\n",
+ "no_fraud = train[train[\"isFraud\"]==0]\n",
+ "yes_fraud = train[train[\"isFraud\"]==1]\n",
+ "\n",
+ "print(no_fraud.shape)\n",
+ "print(yes_fraud.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(748984, 13)\n",
+ "(748984, 13)\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Oversampling in the minority\n",
+ "\n",
+ "from sklearn.utils import resample\n",
+ "\n",
+ "yes_fraud_oversampled = resample(yes_fraud, ## --> minority-class rows to draw from\n",
+ " replace = True, ## --> sample with replacement: there are fewer fraud rows than the number of samples requested\n",
+ " n_samples = len(no_fraud), ## --> match the number of observations of the non-fraud class\n",
+ " random_state=0) \n",
+ "\n",
+ "print(no_fraud.shape)\n",
+ "print(yes_fraud_oversampled.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1497968, 13)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFlaggedFraud | \n",
+ " type_CASH_IN | \n",
+ " type_CASH_OUT | \n",
+ " type_DEBIT | \n",
+ " type_PAYMENT | \n",
+ " type_TRANSFER | \n",
+ " isFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 6258077 | \n",
+ " 599 | \n",
+ " 3118.07 | \n",
+ " 691532.00 | \n",
+ " 688413.93 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 408921 | \n",
+ " 18 | \n",
+ " 88732.48 | \n",
+ " 108.00 | \n",
+ " 88840.48 | \n",
+ " 272817.72 | \n",
+ " 184085.24 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6300210 | \n",
+ " 681 | \n",
+ " 454.83 | \n",
+ " 122.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5400141 | \n",
+ " 377 | \n",
+ " 14467.89 | \n",
+ " 336004.27 | \n",
+ " 321536.38 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2048181 | \n",
+ " 181 | \n",
+ " 487887.45 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 563517.54 | \n",
+ " 1051404.99 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step amount oldbalanceOrg newbalanceOrig oldbalanceDest \\\n",
+ "6258077 599 3118.07 691532.00 688413.93 0.00 \n",
+ "408921 18 88732.48 108.00 88840.48 272817.72 \n",
+ "6300210 681 454.83 122.00 0.00 0.00 \n",
+ "5400141 377 14467.89 336004.27 321536.38 0.00 \n",
+ "2048181 181 487887.45 0.00 0.00 563517.54 \n",
+ "\n",
+ " newbalanceDest isFlaggedFraud type_CASH_IN type_CASH_OUT \\\n",
+ "6258077 0.00 0 0 0 \n",
+ "408921 184085.24 0 1 0 \n",
+ "6300210 0.00 0 0 0 \n",
+ "5400141 0.00 0 0 0 \n",
+ "2048181 1051404.99 0 0 0 \n",
+ "\n",
+ " type_DEBIT type_PAYMENT type_TRANSFER isFraud \n",
+ "6258077 0 1 0 0 \n",
+ "408921 0 0 0 0 \n",
+ "6300210 0 1 0 0 \n",
+ "5400141 0 1 0 0 \n",
+ "2048181 0 0 1 0 "
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_over = pd.concat([no_fraud, yes_fraud_oversampled])\n",
+ "print(train_over.shape)\n",
+ "train_over.head()"
]
},
{
@@ -76,11 +1430,55 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 44,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train accuracy score: 0.9252046772694744\n",
+ "Test accuracy score: 0.935416\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "\n",
+ "## Data is now balanced, so fit a logistic regression on the oversampled training set\n",
+ "\n",
+ "## `columns=` already selects the axis, so the extra `axis=1` was redundant and is dropped\n",
+ "X_train_over = train_over.drop(columns=[\"isFraud\"])\n",
+ "y_train_over = train_over[\"isFraud\"]\n",
+ "\n",
+ "## max_iter is raised above the scikit-learn default to give the solver more iterations to converge\n",
+ "log_reg = LogisticRegression(max_iter=1000)\n",
+ "log_reg.fit(X_train_over, y_train_over)\n",
+ "\n",
+ "## Train score is measured on the oversampled frame; test score on X_test / y_test\n",
+ "print(\"Train accuracy score: \", log_reg.score(X_train_over, y_train_over))\n",
+ "print(\"Test accuracy score: \", log_reg.score(X_test, y_test))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[233568, 16108],\n",
+ " [ 38, 286]])"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## Confusion matrix of the logistic regression on the test set\n",
+ "pred = log_reg.predict(X_test)\n",
+ "confusion_matrix(y_true=y_test, y_pred=pred)\n",
+ "\n",
+ "## Although accuracy decreased, there are considerably fewer false negatives.\n",
+ "## From what I understand, that is a relevant improvement."
]
},
{
@@ -92,11 +1490,56 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 49,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test data accuracy is: 0.991852\n",
+ "training data accuracy is: 0.9959631981457547\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Your code here\n",
+ "\n",
+ "## Going with Decision Tree\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "\n",
+ "## Initializing model\n",
+ "## max_depth=10 caps the tree at 10 levels (not 10 splits in total).\n",
+ "## random_state pins the tie-breaking between equally good splits so a re-run rebuilds the same tree.\n",
+ "tree = DecisionTreeClassifier(max_depth=10, random_state=42)\n",
+ "\n",
+ "## Training the model\n",
+ "tree.fit(X_train_over, y_train_over)\n",
+ "\n",
+ "## Score/evaluation\n",
+ "\n",
+ "print(\"test data accuracy is:\", tree.score(X_test,y_test))\n",
+ "print(\"training data accuracy is:\", tree.score(X_train_over,y_train_over))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[247662, 2014],\n",
+ " [ 23, 301]])"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here"
+ "## Confusion matrix of the decision tree on the test set\n",
+ "pred = tree.predict(X_test)\n",
+ "confusion_matrix(y_true=y_test, y_pred=pred)"
]
},
{
@@ -108,11 +1551,79 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
- "# Your response here"
+ "# Your response here\n",
+ "\n",
+ "## I believe the decision tree model trained on the balanced data is the best alternative. Its accuracy score is high\n",
+ "## and it greatly reduced the number of false negatives, which is important in a fraud/no-fraud scenario."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "recall 0.8827160493827161\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 1.00 0.94 0.97 249676\n",
+ " 1 0.02 0.88 0.03 324\n",
+ "\n",
+ " accuracy 0.94 250000\n",
+ " macro avg 0.51 0.91 0.50 250000\n",
+ "weighted avg 1.00 0.94 0.97 250000\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Using other metrics for evaluation\n",
+ "\n",
+ "from sklearn.metrics import classification_report, recall_score\n",
+ "\n",
+ "## Logistic regression: recall score plus the per-class report on the test set\n",
+ "\n",
+ "pred = log_reg.predict(X_test)\n",
+ "print(\"recall\", recall_score(y_true=y_test, y_pred=pred))\n",
+ "print(classification_report(y_true=y_test, y_pred=pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "recall 0.9290123456790124\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 1.00 0.99 1.00 249676\n",
+ " 1 0.13 0.93 0.23 324\n",
+ "\n",
+ " accuracy 0.99 250000\n",
+ " macro avg 0.56 0.96 0.61 250000\n",
+ "weighted avg 1.00 0.99 0.99 250000\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Decision tree: recall score plus the per-class report on the test set\n",
+ "\n",
+ "pred = tree.predict(X_test)\n",
+ "print(\"recall\", recall_score(y_true=y_test, y_pred=pred))\n",
+ "print(classification_report(y_true=y_test, y_pred=pred))"
]
},
{
@@ -125,7 +1636,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -139,7 +1650,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.9.13"
}
},
"nbformat": 4,