diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb index a3a5359..93181cb 100644 --- a/your-code/lab_imbalance.ipynb +++ b/your-code/lab_imbalance.ipynb @@ -28,11 +28,597 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "# Your code here\n", + "\n", + "## Importing libraries\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "import imblearn\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.tree import DecisionTreeClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1000000, 11)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
3971530297PAYMENT825.54C19378761420.000.00M10227383700.000.0000
1371594138PAYMENT14862.89C6440146661650.0046787.11M9594680270.000.0000
11853511CASH_OUT53321.81C9296437350.000.00C4090605212042322.342665856.0600
5193766369PAYMENT11216.40C141620937214929.59203713.19M4240374230.000.0000
5140264356CASH_OUT117.05C21889877738428.8638311.81C1608405371490943.541491060.5900
\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg newbalanceOrig \\\n", + "3971530 297 PAYMENT 825.54 C1937876142 0.00 0.00 \n", + "1371594 138 PAYMENT 14862.89 C64401466 61650.00 46787.11 \n", + "118535 11 CASH_OUT 53321.81 C929643735 0.00 0.00 \n", + "5193766 369 PAYMENT 11216.40 C141620937 214929.59 203713.19 \n", + "5140264 356 CASH_OUT 117.05 C218898777 38428.86 38311.81 \n", + "\n", + " nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "3971530 M1022738370 0.00 0.00 0 0 \n", + "1371594 M959468027 0.00 0.00 0 0 \n", + "118535 C409060521 2042322.34 2665856.06 0 0 \n", + "5193766 M424037423 0.00 0.00 0 0 \n", + "5140264 C160840537 1490943.54 1491060.59 0 0 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv(\"/Users/caionunez/Downloads/PS_20174392719_1491204439457_log.csv\").sample(1000000)\n", + "print(data.shape)\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
count1000000.0000001.000000e+061.000000e+061.000000e+061.000000e+061.000000e+061000000.0000001000000.000000
mean243.4055101.806395e+058.328807e+058.543091e+051.099738e+061.224374e+060.0013400.000001
std142.4769616.185435e+052.883078e+062.918763e+063.388608e+063.676593e+060.0365810.001000
min1.0000000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.0000000.000000
25%155.0000001.341511e+040.000000e+000.000000e+000.000000e+000.000000e+000.0000000.000000
50%239.0000007.503728e+041.424776e+040.000000e+001.329255e+052.138667e+050.0000000.000000
75%335.0000002.089625e+051.072873e+051.451902e+059.445696e+051.112732e+060.0000000.000000
max743.0000006.384799e+075.958504e+074.958504e+073.249151e+083.555534e+081.0000001.000000
\n", + "
" + ], + "text/plain": [ + " step amount oldbalanceOrg newbalanceOrig \\\n", + "count 1000000.000000 1.000000e+06 1.000000e+06 1.000000e+06 \n", + "mean 243.405510 1.806395e+05 8.328807e+05 8.543091e+05 \n", + "std 142.476961 6.185435e+05 2.883078e+06 2.918763e+06 \n", + "min 1.000000 0.000000e+00 0.000000e+00 0.000000e+00 \n", + "25% 155.000000 1.341511e+04 0.000000e+00 0.000000e+00 \n", + "50% 239.000000 7.503728e+04 1.424776e+04 0.000000e+00 \n", + "75% 335.000000 2.089625e+05 1.072873e+05 1.451902e+05 \n", + "max 743.000000 6.384799e+07 5.958504e+07 4.958504e+07 \n", + "\n", + " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "count 1.000000e+06 1.000000e+06 1000000.000000 1000000.000000 \n", + "mean 1.099738e+06 1.224374e+06 0.001340 0.000001 \n", + "std 3.388608e+06 3.676593e+06 0.036581 0.001000 \n", + "min 0.000000e+00 0.000000e+00 0.000000 0.000000 \n", + "25% 0.000000e+00 0.000000e+00 0.000000 0.000000 \n", + "50% 1.329255e+05 2.138667e+05 0.000000 0.000000 \n", + "75% 9.445696e+05 1.112732e+06 0.000000 0.000000 \n", + "max 3.249151e+08 3.555534e+08 1.000000 1.000000 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "step int64\n", + "type object\n", + "amount float64\n", + "nameOrig object\n", + "oldbalanceOrg float64\n", + "newbalanceOrig float64\n", + "nameDest object\n", + "oldbalanceDest float64\n", + "newbalanceDest float64\n", + "isFraud int64\n", + "isFlaggedFraud int64\n", + "dtype: object" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## I'm thinking the outcome mentioned is the represented in the column \"isFraud\"\n", + "\n", + "data.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "0 998660\n", + "1 1340\n", + "Name: isFraud, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[\"isFraud\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 999999\n", + "1 1\n", + "Name: isFlaggedFraud, dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[\"isFlaggedFraud\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CASH_OUT 350876\n", + "PAYMENT 337961\n", + "CASH_IN 220737\n", + "TRANSFER 83854\n", + "DEBIT 6572\n", + "Name: type, dtype: int64" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[\"type\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
step1.0000000.022152-0.010426-0.0107680.0261990.0246530.0328840.002180
amount0.0221521.000000-0.002622-0.0075680.3053810.4721490.0716740.005490
oldbalanceOrg-0.010426-0.0026221.0000000.9988070.0676110.0430070.0099040.000952
newbalanceOrig-0.010768-0.0075680.9988071.0000000.0691790.042784-0.0079040.000933
oldbalanceDest0.0261990.3053810.0676110.0691791.0000000.976190-0.006897-0.000325
newbalanceDest0.0246530.4721490.0430070.0427840.9761901.000000-0.000392-0.000333
isFraud0.0328840.0716740.009904-0.007904-0.006897-0.0003921.0000000.027300
isFlaggedFraud0.0021800.0054900.0009520.000933-0.000325-0.0003330.0273001.000000
\n", + "
" + ], + "text/plain": [ + " step amount oldbalanceOrg newbalanceOrig \\\n", + "step 1.000000 0.022152 -0.010426 -0.010768 \n", + "amount 0.022152 1.000000 -0.002622 -0.007568 \n", + "oldbalanceOrg -0.010426 -0.002622 1.000000 0.998807 \n", + "newbalanceOrig -0.010768 -0.007568 0.998807 1.000000 \n", + "oldbalanceDest 0.026199 0.305381 0.067611 0.069179 \n", + "newbalanceDest 0.024653 0.472149 0.043007 0.042784 \n", + "isFraud 0.032884 0.071674 0.009904 -0.007904 \n", + "isFlaggedFraud 0.002180 0.005490 0.000952 0.000933 \n", + "\n", + " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "step 0.026199 0.024653 0.032884 0.002180 \n", + "amount 0.305381 0.472149 0.071674 0.005490 \n", + "oldbalanceOrg 0.067611 0.043007 0.009904 0.000952 \n", + "newbalanceOrig 0.069179 0.042784 -0.007904 0.000933 \n", + "oldbalanceDest 1.000000 0.976190 -0.006897 -0.000325 \n", + "newbalanceDest 0.976190 1.000000 -0.000392 -0.000333 \n", + "isFraud -0.006897 -0.000392 1.000000 0.027300 \n", + "isFlaggedFraud -0.000325 -0.000333 0.027300 1.000000 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.corr()\n", + "\n", + "## None of the columns appear to be highly correlated" ] }, { @@ -44,11 +630,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 998660\n", + "1 1340\n", + "Name: isFraud, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your response here\n", + "\n", + "## Briging the value counts for the column again. 
I'm thinking that with this result, considering it's the\n", + "## imbalance labs, we should do balance them out before proceeding.\n", + "\n", + "data[\"isFraud\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGnCAYAAABl41fiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAan0lEQVR4nO3db2yd5X3/8Y9JiMNKbUSymqQ1IbCBooUycFqWhGijG24DQorERia0hjKQapU1S9xUI81UlgjN7QQ060oCFUmjSBmLWP+IrV4XS9NKIExq3KT7Q7Z2hOIAdi0HzQ5p5ZD47AG/ePLPDuSYkAs7r5d0HpzL133O90gNfve+zzmuqVQqlQAAFHJe6QEAgHObGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIqaUDHy9NNP59Zbb83s2bNTU1OT73znO1U/RqVSyYMPPpgrr7wytbW1aWxszJ//+Z+f+WEBgNMytfQA1Th69Giuueaa3HXXXbntttvG9Rh//Md/nF27duXBBx/M1Vdfnf7+/vT19Z3hSQGA01UzUf9QXk1NTb797W9n2bJlw2vHjh3Ln/7pn2bHjh35n//5n8yfPz9f/vKX81u/9VtJkgMHDuTDH/5w/v3f/z1XXXVVmcEBgBEm1GWat3PXXXfl2Wefzd/8zd/kX//1X/N7v/d7+cQnPpGf/OQnSZK/+7u/y+WXX56///u/z9y5c3PZZZflnnvuyWuvvVZ4cgA4d02aGHnhhRfyxBNP5Mknn8ySJUtyxRVXZM2aNbnhhhvyjW98I0ly8ODBvPTSS3nyySezffv2bNu2LZ2dnfnd3/3dwtMDwLlrQr1n5K388Ic/TKVSyZVXXjlifXBwMDNmzEiSDA0NZXBwMNu3bx/et2XLljQ1NeW//uu/XLoBgAImTYwMDQ1lypQp6ezszJQpU0b87MILL0ySzJo1K1OnTh0RLPPmzUuSdHV1iREAKGDSxMi1116bEydOpLe3N0uWLBlzz+LFi3P8+PG88MILueKKK5IkP/7xj5Mkc+bMOWuzAgD/Z0J9mub111/Pf//3fyd5Mz4efvjh3Hjjjbn44otz6aWX5g/+4A/y7LPP5qGHHsq1116bvr6+/NM//VOuvvrq3HzzzRkaGspHPvKRXHjhhdm4cWOGhoZy7733pq6uLrt27Sr86gDg3DShYuSf//mfc+ONN45av/POO7Nt27a88cYbeeCBB7J9+/a88sormTFjRhYuXJj169fn6quvTpK8+uqr+exnP5tdu3blfe97X5YuXZqHHnooF1988dl+OQBAJliMAACTz6T5aC8AMDGJEQCgqAnxaZqhoaG8+uqref/735+amprS4wAAp6FSqeTIkSOZPXt2zjvv1Oc/JkSMvPrqq2lsbCw9BgAwDocOHcqHPvShU/58QsTI+9///iRvvpi6urrC0wAAp2NgYCCNjY3Dv8dPZULEyMlLM3V1dWIEACaYt3uLhTewAgBFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKCoqmPk6aefzq233prZs2enpqYm3/nOd972mO9///tpamrK9OnTc/nll+fRRx8dz6wAwCRUdYwcPXo011xzTb72ta+d1v4XX3wxN998c5YsWZJ9+/blC1/4QlauXJlvfvObV
Q8LAEw+Vf+hvKVLl2bp0qWnvf/RRx/NpZdemo0bNyZJ5s2bl7179+bBBx/MbbfdVu3TAwCTzLv+npHnnnsuzc3NI9Y+/vGPZ+/evXnjjTfGPGZwcDADAwMjbgDA5FT1mZFq9fT0pKGhYcRaQ0NDjh8/nr6+vsyaNWvUMW1tbVm/fv27PdqEcNl93y09AmfRT790S+kRAM66s/JpmpqamhH3K5XKmOsnrV27Nv39/cO3Q4cOveszAgBlvOtnRi655JL09PSMWOvt7c3UqVMzY8aMMY+pra1NbW3tuz0aAPAe8K6fGVm4cGE6OjpGrO3atSsLFizI+eef/24/PQDwHld1jLz++uvZv39/9u/fn+TNj+7u378/XV1dSd68xLJixYrh/S0tLXnppZfS2tqaAwcOZOvWrdmyZUvWrFlzZl4BADChVX2ZZu/evbnxxhuH77e2tiZJ7rzzzmzbti3d3d3DYZIkc+fOTXt7e1avXp1HHnkks2fPzle/+lUf6wUAkiQ1lZPvJn0PGxgYSH19ffr7+1NXV1d6nLPKp2nOLT5NA0wmp/v729+mAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKGpcMbJp06bMnTs306dPT1NTU3bv3v2W+3fs2JFrrrkmv/RLv5RZs2blrrvuyuHDh8c1MAAwuVQdIzt37syqVauybt267Nu3L0uWLMnSpUvT1dU15v5nnnkmK1asyN13353/+I//yJNPPpkf/OAHueeee97x8ADAxFd1jDz88MO5++67c88992TevHnZuHFjGhsbs3nz5jH3/8u//Esuu+yyrFy5MnPnzs0NN9yQT3/609m7d+87Hh4AmPiqipFjx46ls7Mzzc3NI9abm5uzZ8+eMY9ZtGhRXn755bS3t6dSqeRnP/tZ/vZv/za33HLLKZ9ncHAwAwMDI24AwORUVYz09fXlxIkTaWhoGLHe0NCQnp6eMY9ZtGhRduzYkeXLl2fatGm55JJLctFFF+Wv/uqvTvk8bW1tqa+vH741NjZWMyYAMIGM6w2sNTU1I+5XKpVRayc9//zzWblyZb74xS+ms7Mz3/ve9/Liiy+mpaXllI+/du3a9Pf3D98OHTo0njEBgAlgajWbZ86cmSlTpow6C9Lb2zvqbMlJbW1tWbx4cT7/+c8nST784Q/nfe97X5YsWZIHHnggs2bNGnVMbW1tamtrqxkNAJigqjozMm3atDQ1NaWjo2PEekdHRxYtWjTmMT//+c9z3nkjn2bKlClJ3jyjAgCc26q+TNPa2prHH388W7duzYEDB7J69ep0dXUNX3ZZu3ZtVqxYMbz/1ltvzbe+9a1s3rw5Bw8ezLPPPpuVK1fmox/9aGbPnn3mXgkAMCFVdZkmSZYvX57Dhw9nw4YN6e7uzvz589Pe3p45c+YkSbq7u0d858inPvWpHDlyJF/72tfyuc99LhdddFE+9rGP5ctf/vKZexUAwIRVU5kA10oGBgZSX1+f/v7+1NXVlR7nrLrsvu+WHoGz6KdfOvVH3gEmmtP9/e1v0wAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSI
wBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosYVI5s2bcrcuXMzffr0NDU1Zffu3W+5f3BwMOvWrcucOXNSW1ubK664Ilu3bh3XwADA5DK12gN27tyZVatWZdOmTVm8eHEee+yxLF26NM8//3wuvfTSMY+5/fbb87Of/SxbtmzJr/zKr6S3tzfHjx9/x8MDABNfTaVSqVRzwPXXX5/rrrsumzdvHl6bN29eli1blra2tlH7v/e97+X3f//3c/DgwVx88cXjGnJgYCD19fXp7+9PXV3duB5jorrsvu+WHoGz6KdfuqX0CABnzOn+/q7qMs2xY8fS2dmZ5ubmEevNzc3Zs2fPmMc89dRTWbBgQf7iL/4iH/zgB3PllVdmzZo1+cUvfnHK5xkcHMzAwMCIGwAwOVV1maavry8nTpxIQ0PDiPWGhob09PSMeczBgwfzzDPPZPr06fn2t7+dvr6+fOYzn8lrr712yveNtLW1Zf369dWMBgBMUON6A2tNTc2I+5VKZdTaSUNDQ6mpqcmOHTvy0Y9+NDfffHMefvjhbNu27ZRnR9auXZv+/v7h26FDh8YzJgAwAVR1ZmTmzJmZMmXKqLMgvb29o86WnDRr1qx88IMfTH19/fDavHnzUqlU8vLLL+dXf/VXRx1TW1ub2traakYDACaoqs6MTJs2LU1NTeno6Bix3tHRkUWLFo15zOLFi/Pqq6/m9ddfH1778Y9/nPPOOy8f+tCHxjEyADCZVH2ZprW1NY8//ni2bt2aAwcOZPXq1enq6kpLS0uSNy+xrFixYnj/HXfckRkzZuSuu+7K888/n6effjqf//zn84d/+Ie54IILztwrAQAmpKq/Z2T58uU5fPhwNmzYkO7u7syfPz/t7e2ZM2dOkqS7uztdXV3D+y+88MJ0dHTks5/9bBYsWJAZM2bk9ttvzwMPPHDmXgUAMGFV/T0jJfieEc4VvmcEmEzele8ZAQA408QIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoaV4xs2rQpc+fOzfTp09PU1JTdu3ef1nHPPvtspk6dml//9V8fz9MCAJNQ1TGyc+fOrFq1KuvWrcu+ffuyZMmSLF26NF1dXW95XH9/f1asWJHf/u3fHvewAMDkU3WMPPzww7n77rtzzz33ZN68edm4cWMaGxuzefPmtzzu05/+dO64444sXLhw3MMCAJNPVTFy7NixdHZ2prm5ecR6c3Nz9uzZc8rjvvGNb+SFF17I/ffff1rPMzg4mIGBgRE3AGByqipG+vr6cuLEiTQ0NIxYb2hoSE9Pz5jH/OQnP8l9992XHTt2ZOrUqaf1PG1tbamvrx++NTY2VjMmADCBjOsNrDU1NSPuVyqVUWtJcuLEidxxxx1Zv359rrzyytN+/LVr16a/v3/4dujQofGMCQBMAKd3quL/mTlzZqZMmTLqLEhvb++osyVJcuTIkezduzf79u3LH/3RHyVJhoaGU
qlUMnXq1OzatSsf+9jHRh1XW1ub2traakYDACaoqs6MTJs2LU1NTeno6Bix3tHRkUWLFo3aX1dXl3/7t3/L/v37h28tLS256qqrsn///lx//fXvbHoAYMKr6sxIkrS2tuaTn/xkFixYkIULF+brX/96urq60tLSkuTNSyyvvPJKtm/fnvPOOy/z588fcfwHPvCBTJ8+fdQ6AHBuqjpGli9fnsOHD2fDhg3p7u7O/Pnz097enjlz5iRJuru73/Y7RwAATqqpVCqV0kO8nYGBgdTX16e/vz91dXWlxzmrLrvvu6VH4Cz66ZduKT0CwBlzur+//W0aAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFDUuGJk06ZNmTt3bqZPn56mpqbs3r37lHu/9a1v5aabbsov//Ivp66uLgsXLsw//uM/jntgAGByqTpGdu7cmVWrVmXdunXZt29flixZkqVLl6arq2vM/U8//XRuuummtLe3p7OzMzfeeGNuvfXW7Nu37x0PDwBMfDWVSqVSzQHXX399rrvuumzevHl4bd68eVm2bFna2tpO6zF+7dd+LcuXL88Xv/jF09o/MDCQ+vr69Pf3p66urppxJ7zL7vtu6RE4i376pVtKjwBwxpzu7++qzowcO3YsnZ2daW5uHrHe3NycPXv2nNZjDA0N5ciRI7n44otPuWdwcDADAwMjbgDA5FRVjPT19eXEiRNpaGgYsd7Q0JCenp7TeoyHHnooR48eze23337KPW1tbamvrx++NTY2VjMmADCBjOsNrDU1NSPuVyqVUWtjeeKJJ/Jnf/Zn2blzZz7wgQ+cct/atWvT398/fDt06NB4xgQAJoCp1WyeOXNmpkyZMuosSG9v76izJf+/nTt35u67786TTz6Z3/md33nLvbW1tamtra1mNABggqrqzMi0adPS1NSUjo6OEesdHR1ZtGjRKY974okn8qlPfSp//dd/nVtu8QY9AOD/VHVmJElaW1vzyU9+MgsWLMjChQvz9a9/PV1dXWlpaUny5iWWV155Jdu3b0/yZoisWLEif/mXf5nf+I3fGD6rcsEFF6S+vv4MvhQAYCKqOkaWL1+ew4cPZ8OGDenu7s78+fPT3t6eOXPmJEm6u7tHfOfIY489luPHj+fee+/NvffeO7x+5513Ztu2be/8FQAAE1rV3zNSgu8Z4Vzhe0aAyeRd+Z4RAIAzTYwAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQlBgBAIoSIwBAUWIEAChKjAAARYkRAKAoMQIAFCVGAICixAgAUJQYAQCKEiMAQFFiBAAoSowAAEWJEQCgKDECABQlRgCAosQIAFCUGAEAihIjAEBRYgQAKEqMAABFiREAoKhxxcimTZsyd+7cTJ8+PU1NTdm9e/db7v/+97+fpqamTJ8+PZdffnkeffTRcQ0LAEw+VcfIzp07s2rVqqxbty779u3LkiVLsnTp0nR1d
Y25/8UXX8zNN9+cJUuWZN++ffnCF76QlStX5pvf/OY7Hh4AmPhqKpVKpZoDrr/++lx33XXZvHnz8Nq8efOybNmytLW1jdr/J3/yJ3nqqady4MCB4bWWlpb86Ec/ynPPPXdazzkwMJD6+vr09/enrq6umnEnvMvu+27pETiLfvqlW0qPAHDGnO7v76nVPOixY8fS2dmZ++67b8R6c3Nz9uzZM+Yxzz33XJqbm0esffzjH8+WLVvyxhtv5Pzzzx91zODgYAYHB4fv9/f3J3nzRZ1rhgZ/XnoEzqJz8X/jwOR18r9pb3feo6oY6evry4kTJ9LQ0DBivaGhIT09PWMe09PTM+b+48ePp6+vL7NmzRp1TFtbW9avXz9qvbGxsZpxYcKp31h6AoAz78iRI6mvrz/lz6uKkZNqampG3K9UKqPW3m7/WOsnrV27Nq2trcP3h4aG8tprr2XGjBlv+TxMDgMDA2lsbMyhQ4fOuctyMNn5931uqVQqOXLkSGbPnv2W+6qKkZkzZ2bKlCmjzoL09vaOOvtx0iWXXDLm/qlTp2bGjBljHlNbW5va2toRaxdddFE1ozIJ1NXV+Y8VTFL+fZ873uqMyElVfZpm2rRpaWpqSkdHx4j1jo6OLFq0aMxjFi5cOGr/rl27smDBgjHfLwIAnFuq/mhva2trHn/88WzdujUHDhzI6tWr09XVlZaWliRvXmJZsWLF8P6Wlpa89NJLaW1tzYEDB7J169Zs2bIla9asOXOvAgCYsKp+z8jy5ctz+PDhbNiwId3d3Zk/f37a29szZ86cJEl3d/eI7xyZO3du2tvbs3r16jzyyCOZPXt2vvrVr+a22247c6+CSaW2tjb333//qEt1wMTn3zdjqfp7RgAAziR/mwYAKEqMAABFiREAoCgxAgAUJUYAgKLG9XXwcCa9/PLL2bx5c/bs2ZOenp7U1NSkoaEhixYtSktLi79JBDDJ+WgvRT3zzDNZunRpGhsb09zcnIaGhlQqlfT29qajoyOHDh3KP/zDP2Tx4sWlRwXOsEOHDuX+++/P1q1bS49CYWKEoj7ykY/khhtuyFe+8pUxf7569eo888wz+cEPfnCWJwPebT/60Y9y3XXX5cSJE6VHoTAxQlEXXHBB9u/fn6uuumrMn//nf/5nrr322vziF784y5MB79RTTz31lj8/ePBgPve5z4kRvGeEsmbNmpU9e/acMkaee+65zJo16yxPBZwJy5YtS01NTd7q//PW1NScxYl4rxIjFLVmzZq0tLSks7MzN910UxoaGlJTU5Oenp50dHTk8ccfz8aNG0uPCYzDrFmz8sgjj2TZsmVj/nz//v1pamo6u0PxniRGKOozn/lMZsyYka985St57LHHhk/XTpkyJU1NTdm+fXtuv/32wlMC49HU1JQf/vCHp4yRtztrwrnDe0Z4z3jjjTfS19eXJJk5c2bOP//8whMB78Tu3btz9OjRfOITnxjz50ePHs3evXvzm7/5m2d5Mt5rxAgAUJRvYAUAihIjAEBRYgQAKEqMAABFiREAoCgxAgAUJUYAgKLECABQ1P8ChMnWVWrGPWIAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# Your response here" + "data[\"isFraud\"].value_counts().plot(kind=\"bar\")\n", + "plt.show()\n", + "\n", + "## Quite unbalanced" ] }, { @@ -60,11 +687,738 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 28, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
3971530297PAYMENT825.540.000.000.000.0000
1371594138PAYMENT14862.8961650.0046787.110.000.0000
11853511CASH_OUT53321.810.000.002042322.342665856.0600
5193766369PAYMENT11216.40214929.59203713.190.000.0000
5140264356CASH_OUT117.0538428.8638311.811490943.541491060.5900
\n", + "
" + ], + "text/plain": [ + " step type amount oldbalanceOrg newbalanceOrig \\\n", + "3971530 297 PAYMENT 825.54 0.00 0.00 \n", + "1371594 138 PAYMENT 14862.89 61650.00 46787.11 \n", + "118535 11 CASH_OUT 53321.81 0.00 0.00 \n", + "5193766 369 PAYMENT 11216.40 214929.59 203713.19 \n", + "5140264 356 CASH_OUT 117.05 38428.86 38311.81 \n", + "\n", + " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "3971530 0.00 0.00 0 0 \n", + "1371594 0.00 0.00 0 0 \n", + "118535 2042322.34 2665856.06 0 0 \n", + "5193766 0.00 0.00 0 0 \n", + "5140264 1490943.54 1491060.59 0 0 " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here\n", + "\n", + "## I don't understand the question about the time variable, but I'm removing the name columns as they appear to not add any value. The other\n", + "## string column, \"Type\" will not be dropped as it seems relevant to the understanding of the data. I'll create\n", + "## dummies for it.\n", + "\n", + "data.drop(columns = ([\"nameOrig\", \"nameDest\"]), inplace = True)\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFraudisFlaggedFraudtype_CASH_INtype_CASH_OUTtype_DEBITtype_PAYMENTtype_TRANSFER
3971530297PAYMENT825.540.000.000.000.000000010
1371594138PAYMENT14862.8961650.0046787.110.000.000000010
11853511CASH_OUT53321.810.000.002042322.342665856.060001000
5193766369PAYMENT11216.40214929.59203713.190.000.000000010
5140264356CASH_OUT117.0538428.8638311.811490943.541491060.590001000
\n", + "
" + ], + "text/plain": [ + " step type amount oldbalanceOrg newbalanceOrig \\\n", + "3971530 297 PAYMENT 825.54 0.00 0.00 \n", + "1371594 138 PAYMENT 14862.89 61650.00 46787.11 \n", + "118535 11 CASH_OUT 53321.81 0.00 0.00 \n", + "5193766 369 PAYMENT 11216.40 214929.59 203713.19 \n", + "5140264 356 CASH_OUT 117.05 38428.86 38311.81 \n", + "\n", + " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \\\n", + "3971530 0.00 0.00 0 0 \n", + "1371594 0.00 0.00 0 0 \n", + "118535 2042322.34 2665856.06 0 0 \n", + "5193766 0.00 0.00 0 0 \n", + "5140264 1490943.54 1491060.59 0 0 \n", + "\n", + " type_CASH_IN type_CASH_OUT type_DEBIT type_PAYMENT type_TRANSFER \n", + "3971530 0 0 0 1 0 \n", + "1371594 0 0 0 1 0 \n", + "118535 0 1 0 0 0 \n", + "5193766 0 0 0 1 0 \n", + "5140264 0 1 0 0 0 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here\n" + "dummies = pd.get_dummies(data['type'], prefix = \"type\")\n", + "data = pd.concat([data, dummies], axis=1)\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFraudisFlaggedFraudtype_CASH_INtype_CASH_OUTtype_DEBITtype_PAYMENTtype_TRANSFER
3971530297825.540.000.000.000.000000010
137159413814862.8961650.0046787.110.000.000000010
1185351153321.810.000.002042322.342665856.060001000
519376636911216.40214929.59203713.190.000.000000010
5140264356117.0538428.8638311.811490943.541491060.590001000
\n", + "
" + ], + "text/plain": [ + " step amount oldbalanceOrg newbalanceOrig oldbalanceDest \\\n", + "3971530 297 825.54 0.00 0.00 0.00 \n", + "1371594 138 14862.89 61650.00 46787.11 0.00 \n", + "118535 11 53321.81 0.00 0.00 2042322.34 \n", + "5193766 369 11216.40 214929.59 203713.19 0.00 \n", + "5140264 356 117.05 38428.86 38311.81 1490943.54 \n", + "\n", + " newbalanceDest isFraud isFlaggedFraud type_CASH_IN type_CASH_OUT \\\n", + "3971530 0.00 0 0 0 0 \n", + "1371594 0.00 0 0 0 0 \n", + "118535 2665856.06 0 0 0 1 \n", + "5193766 0.00 0 0 0 0 \n", + "5140264 1491060.59 0 0 0 1 \n", + "\n", + " type_DEBIT type_PAYMENT type_TRANSFER \n", + "3971530 0 1 0 \n", + "1371594 0 1 0 \n", + "118535 0 0 0 \n", + "5193766 0 1 0 \n", + "5140264 0 0 0 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#data.drop([\"type\"], axis = 1, inplace = True)\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train accuracy score: 0.9981973333333334\n", + "Test accuracy score: 0.99824\n" + ] + } + ], + "source": [ + "## I'll create a baseline model with data like this, and then do the balancing and run the other models\n", + "## requested in the cells below.\n", + "\n", + "X = data.drop(columns = [\"isFraud\"])\n", + "y = data[\"isFraud\"]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)\n", + "\n", + "log_reg = LogisticRegression(max_iter = 1000)\n", + "log_reg.fit(X_train, y_train)\n", + "print(\"Train accuracy score: \", log_reg.score(X_train, y_train))\n", + "print(\"Test accuracy score: \", log_reg.score(X_test, y_test))\n", + "\n", + "## okay I think they are not as good as they seem due to the data being so unbalanced. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[249419, 257],\n", + " [ 183, 141]])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "\n", + "pred = log_reg.predict(X_test)\n", + "confusion_matrix(y_test, pred)\n", + "\n", + "## A lot of false negatives considering the actual number of negatives, will proceed with the balancing" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(748984, 13)\n", + "(1016, 13)\n" + ] + } + ], + "source": [ + "## Oversampling of the data\n", + "\n", + "train = pd.concat([X_train,y_train], axis = 1)\n", + "no_fraud = train[train[\"isFraud\"]==0]\n", + "yes_fraud = train[train[\"isFraud\"]==1]\n", + "\n", + "print(no_fraud.shape)\n", + "print(yes_fraud.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(748984, 13)\n", + "(748984, 13)\n" + ] + } + ], + "source": [ + "## Oversampling in the minority\n", + "\n", + "from sklearn.utils import resample\n", + "\n", + "yes_fraud_oversampled = resample(yes_fraud, ## -- > oversample from here\n", + " replace = True, ## -- > we need replacement since we don't have enough datapoints. This will always be used for oversampling\n", + " n_samples = len(no_fraud), ## --> matching the lenghts/nÂș of observations for both yes/no diabetes\n", + " random_state=0) \n", + "\n", + "print(no_fraud.shape)\n", + "print(yes_fraud_oversampled.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1497968, 13)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFlaggedFraudtype_CASH_INtype_CASH_OUTtype_DEBITtype_PAYMENTtype_TRANSFERisFraud
62580775993118.07691532.00688413.930.000.000000100
4089211888732.48108.0088840.48272817.72184085.240100000
6300210681454.83122.000.000.000.000000100
540014137714467.89336004.27321536.380.000.000000100
2048181181487887.450.000.00563517.541051404.990000010
\n", + "
" + ], + "text/plain": [ + " step amount oldbalanceOrg newbalanceOrig oldbalanceDest \\\n", + "6258077 599 3118.07 691532.00 688413.93 0.00 \n", + "408921 18 88732.48 108.00 88840.48 272817.72 \n", + "6300210 681 454.83 122.00 0.00 0.00 \n", + "5400141 377 14467.89 336004.27 321536.38 0.00 \n", + "2048181 181 487887.45 0.00 0.00 563517.54 \n", + "\n", + " newbalanceDest isFlaggedFraud type_CASH_IN type_CASH_OUT \\\n", + "6258077 0.00 0 0 0 \n", + "408921 184085.24 0 1 0 \n", + "6300210 0.00 0 0 0 \n", + "5400141 0.00 0 0 0 \n", + "2048181 1051404.99 0 0 0 \n", + "\n", + " type_DEBIT type_PAYMENT type_TRANSFER isFraud \n", + "6258077 0 1 0 0 \n", + "408921 0 0 0 0 \n", + "6300210 0 1 0 0 \n", + "5400141 0 1 0 0 \n", + "2048181 0 0 1 0 " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_over = pd.concat([no_fraud, yes_fraud_oversampled])\n", + "print(train_over.shape)\n", + "train_over.head()" ] }, { @@ -76,11 +1430,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train accuracy score: 0.9252046772694744\n", + "Test accuracy score: 0.935416\n" + ] + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "\n", + "## Data is now balanced, will run a logistic regression\n", + "\n", + "X_train_over = train_over.drop(columns = [\"isFraud\"], axis=1)\n", + "y_train_over = train_over[\"isFraud\"]\n", + "\n", + "log_reg = LogisticRegression(max_iter = 1000)\n", + "log_reg.fit(X_train_over, y_train_over)\n", + "print(\"Train accuracy score: \", log_reg.score(X_train_over, y_train_over))\n", + "print(\"Test accuracy score: \", log_reg.score(X_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[233568, 16108],\n", + " [ 38, 286]])" + ] + }, + 
"execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred = log_reg.predict(X_test)\n", + "confusion_matrix(y_test, pred)\n", + "\n", + "## Although accuracy decreased, there are considerably less false negatives.\n", + "## From what I understand that is a relevant improvement" ] }, { @@ -92,11 +1490,56 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "test data accuracy is: 0.991852\n", + "training data accuracy is: 0.9959631981457547\n" + ] + } + ], + "source": [ + "# Your code here\n", + "\n", + "## Going with Decision Tree\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "## Initializing model\n", + "tree = DecisionTreeClassifier(max_depth=10) ## Max depth means we want to make 10 splits/levels at maximum\n", + "\n", + "## Training the model\n", + "tree.fit(X_train_over,y_train_over)\n", + "\n", + "##Score/eveluation\n", + "\n", + "print(\"test data accuracy is:\", tree.score(X_test,y_test))\n", + "print(\"training data accuracy is:\", tree.score(X_train_over,y_train_over))" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[247662, 2014],\n", + " [ 23, 301]])" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "pred = tree.predict(X_test)\n", + "confusion_matrix(y_test, pred)" ] }, { @@ -108,11 +1551,79 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ - "# Your response here" + "# Your response here\n", + "\n", + "## I believe the decision tree model after the balancing is the best alternative. 
The accuracy score is high\n", + "## and we decreased the numbers of false negatives a lot, which is important considering the fraud/no fraud scenario." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "recall 0.8827160493827161\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.94 0.97 249676\n", + " 1 0.02 0.88 0.03 324\n", + "\n", + " accuracy 0.94 250000\n", + " macro avg 0.51 0.91 0.50 250000\n", + "weighted avg 1.00 0.94 0.97 250000\n", + "\n" + ] + } + ], + "source": [ + "## Using other metrics for evaluation\n", + "\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.metrics import recall_score\n", + "\n", + "## Logistic regression other metrics\n", + "\n", + "pred = log_reg.predict(X_test)\n", + "print(\"recall\", recall_score(y_test, pred))\n", + "print(classification_report(y_test,pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "recall 0.9290123456790124\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.99 1.00 249676\n", + " 1 0.13 0.93 0.23 324\n", + "\n", + " accuracy 0.99 250000\n", + " macro avg 0.56 0.96 0.61 250000\n", + "weighted avg 1.00 0.99 0.99 250000\n", + "\n" + ] + } + ], + "source": [ + "## Decision tree other metrics\n", + "\n", + "pred = tree.predict(X_test)\n", + "print(\"recall\", recall_score(y_test, pred))\n", + "print(classification_report(y_test,pred))" ] }, { @@ -125,7 +1636,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -139,7 +1650,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.13" } }, "nbformat": 4,