From 64d44fae0c4bf5dfe5d8d4f79f118074b990f695 Mon Sep 17 00:00:00 2001 From: Jasper Tielmann Date: Sun, 26 Nov 2023 22:46:56 +0000 Subject: [PATCH] lab done --- your-code/lab_imbalance.ipynb | 911 +++++++++++++++++++++++++++++++++- 1 file changed, 896 insertions(+), 15 deletions(-) diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb index a3a5359..def53a7 100644 --- a/your-code/lab_imbalance.ipynb +++ b/your-code/lab_imbalance.ipynb @@ -28,11 +28,368 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "import pyforest\n", + "import datetime\n", + "from sklearn.utils import resample\n", + "from imblearn.over_sampling import SMOTE\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
4598522328PAYMENT32736.40C1684243534590426.66557690.26M8436108240.000.0000
5077450355CASH_IN300760.89C3346053505558182.235858943.12C1536931569423160.44122399.5500
5563911393TRANSFER772630.81C2871020040.000.00C3595859541074556.051847186.8500
1344831137PAYMENT18602.01C130051231952154.0033551.99M9262872600.000.0000
5080669355CASH_OUT205936.10C10946441921826.000.00C1674575590463633.37669569.4700
\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg \\\n", + "4598522 328 PAYMENT 32736.40 C1684243534 590426.66 \n", + "5077450 355 CASH_IN 300760.89 C334605350 5558182.23 \n", + "5563911 393 TRANSFER 772630.81 C287102004 0.00 \n", + "1344831 137 PAYMENT 18602.01 C1300512319 52154.00 \n", + "5080669 355 CASH_OUT 205936.10 C109464419 21826.00 \n", + "\n", + " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n", + "4598522 557690.26 M843610824 0.00 0.00 0 \n", + "5077450 5858943.12 C1536931569 423160.44 122399.55 0 \n", + "5563911 0.00 C359585954 1074556.05 1847186.85 0 \n", + "1344831 33551.99 M926287260 0.00 0.00 0 \n", + "5080669 0.00 C1674575590 463633.37 669569.47 0 \n", + "\n", + " isFlaggedFraud \n", + "4598522 0 \n", + "5077450 0 \n", + "5563911 0 \n", + "1344831 0 \n", + "5080669 0 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"PS_20174392719_1491204439457_log.csv\").sample(100000)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "step int64\n", + "type object\n", + "amount float64\n", + "nameOrig object\n", + "oldbalanceOrg float64\n", + "newbalanceOrig float64\n", + "nameDest object\n", + "oldbalanceDest float64\n", + "newbalanceDest float64\n", + "isFraud int64\n", + "isFlaggedFraud int64\n", + "dtype: object" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
count100000.0000001.000000e+051.000000e+051.000000e+051.000000e+051.000000e+05100000.000000100000.0
mean243.9172701.780445e+058.328977e+058.538896e+051.102595e+061.224264e+060.0013900.0
std142.1171975.922946e+052.897921e+062.934425e+063.438852e+063.664626e+060.0372570.0
min1.0000009.100000e-010.000000e+000.000000e+000.000000e+000.000000e+000.0000000.0
25%156.0000001.329020e+040.000000e+000.000000e+000.000000e+000.000000e+000.0000000.0
50%240.0000007.361306e+041.376350e+040.000000e+001.260156e+052.074859e+050.0000000.0
75%335.0000002.062294e+051.066145e+051.409539e+059.452993e+051.119223e+060.0000000.0
max742.0000004.574461e+073.361492e+073.371395e+072.001162e+082.052830e+081.0000000.0
\n", + "
" + ], + "text/plain": [ + " step amount oldbalanceOrg newbalanceOrig \\\n", + "count 100000.000000 1.000000e+05 1.000000e+05 1.000000e+05 \n", + "mean 243.917270 1.780445e+05 8.328977e+05 8.538896e+05 \n", + "std 142.117197 5.922946e+05 2.897921e+06 2.934425e+06 \n", + "min 1.000000 9.100000e-01 0.000000e+00 0.000000e+00 \n", + "25% 156.000000 1.329020e+04 0.000000e+00 0.000000e+00 \n", + "50% 240.000000 7.361306e+04 1.376350e+04 0.000000e+00 \n", + "75% 335.000000 2.062294e+05 1.066145e+05 1.409539e+05 \n", + "max 742.000000 4.574461e+07 3.361492e+07 3.371395e+07 \n", + "\n", + " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "count 1.000000e+05 1.000000e+05 100000.000000 100000.0 \n", + "mean 1.102595e+06 1.224264e+06 0.001390 0.0 \n", + "std 3.438852e+06 3.664626e+06 0.037257 0.0 \n", + "min 0.000000e+00 0.000000e+00 0.000000 0.0 \n", + "25% 0.000000e+00 0.000000e+00 0.000000 0.0 \n", + "50% 1.260156e+05 2.074859e+05 0.000000 0.0 \n", + "75% 9.452993e+05 1.119223e+06 0.000000 0.0 \n", + "max 2.001162e+08 2.052830e+08 1.000000 0.0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" ] }, { @@ -44,11 +401,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import matplotlib.pyplot as plt\\nimport pandas as pd'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAkIAAAGYCAYAAACu6o3UAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAjSElEQVR4nO3db3BU5d3/8c+akDVkkmMgZpetuW2cyaSkoS1GJwRsoQMELCHjtFNsozsypREnShpJijC2FZ0xUUBwairinxb/oPEBTesUSJPaDppCIEbWGgR9IDVBsgTLsgkxbmI49wN/nF+XIKL3Qkiu92tmH+w53929jiPm7ZXdxWXbti0AAAADXTbSCwAAABgphBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAY8WP9AIudadOndKRI0eUnJwsl8s10ssBAADnwbZt9fb2yufz6bLLPn/fhxD6AkeOHFFGRsZILwMAAHwFnZ2duuqqqz73PCH0BZKTkyV99g8yJSVlhFcDAADOR09PjzIyMpyf45+HEPoCp38dlpKSQggBADDKfNHbWnizNAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIz1pUPotdde08KFC+Xz+eRyufSnP/0p6rxt21q9erV8Pp8SExM1a9Ys7d+/P2omEolo2bJlSktLU1JSkoqLi3X48OGomVAoJL/fL8uyZFmW/H6/Tpw4ETXT0dGhhQsXKikpSWlpaSovL9fAwEDUzNtvv62ZM2cqMTFRX/va1/TAAw/Itu0ve9kAAGAM+tIh1NfXp29/+9uqra096/k1a9Zo/fr1qq2tVWtrq7xer+bOnave3l5npqKiQvX19aqrq1Nzc7NOnjypoqIiDQ0NOTMlJSUKBAJqaGhQQ0ODAoGA/H6/c35oaEgLFixQX1+fmpubVVdXp61bt6qystKZ6enp0dy5c+Xz+dTa2qrHHntM69at0/r167/sZQMAgLHI/j+QZNfX1zv3T506ZXu9Xvuhhx5yjn3yySe2ZVn2E088Ydu2bZ84ccIeN26cXVdX58x8+OGH9mWXXWY3NDTYtm3b77zzji3JbmlpcWZ2795tS7IPHjxo27Ztb9++3b7sssvsDz/80Jl56aWXbLfbbYfDYdu2bfvxxx+3LcuyP/nkE2empqbG9vl89qlTp87rGsPhsC3JeU4AAHDpO9+f3zF9j9ChQ4cUDAZVWFjoHHO73Zo5c6Z27dolSWpra9Pg4GDUjM/nU25urjOze/duWZal/Px8Z2batGmyLCtqJjc3Vz6fz5mZN2+eIpGI2tranJmZM2fK7XZHzRw5ckT//ve/z3oNkUhEPT09UTcAADA2xcfyyYLBoCTJ4/FEHfd4PPrggw+cmYSEBKWmpg6bOf34YDCo9PT0Yc+fnp4eNXPm66SmpiohISFq5utf//qw1zl9LjMzc9hr1NTU6P777z+v6x3rHtr30UgvARfRyqlpI70EALjoLsinxlwuV9R927aHHTvTmTNnm4/FjP3/3ij9eetZtWqVwuGwc+vs7DznugEAwOgV0xDyer2S/v/O0Gnd3d3OTozX69XAwIBCodA5Z44ePTrs+Y8dOxY1c+brhEIhDQ4OnnOmu7tb0vBdq9PcbrdSUlKibgAAYGyKaQhlZmbK6/WqqanJOTYwMKCdO3dq+vTpkqS8vDyNGzcuaqarq0vt7e3OTEFBgcLhsPbu3evM7NmzR+FwOGqmvb1dXV1dzkxjY6Pcbrfy8vKcmddeey3qI/WNjY3y+XzDfmUGAADM86VD6OTJkwoEAgoEApI+e4N0IBBQR0eHXC6XKioqVF1drfr6erW3t2vx4sUaP368SkpKJEmWZWnJkiWqrKzUq6++qn379unWW2/VlClTNGfOHEnS5MmTNX/+fJWWlqqlpUUtLS0qLS1VUVGRsrOzJUmFhYXKycmR3+/Xvn3
79Oqrr6qqqkqlpaXOLk5JSYncbrcWL16s9vZ21dfXq7q6WsuXL//CX9UBAICx70u/WfqNN97Q97//fef+8uXLJUm33XabNm/erBUrVqi/v19lZWUKhULKz89XY2OjkpOTncds2LBB8fHxWrRokfr7+zV79mxt3rxZcXFxzsyWLVtUXl7ufLqsuLg46ruL4uLitG3bNpWVlWnGjBlKTExUSUmJ1q1b58xYlqWmpibdeeeduu6665Samqrly5c7awYAAGZz2TZfs3wuPT09sixL4XDYuPcL8akxs/CpMQBjyfn+/ObvGgMAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGCsmIfQp59+ql/96lfKzMxUYmKirrnmGj3wwAM6deqUM2PbtlavXi2fz6fExETNmjVL+/fvj3qeSCSiZcuWKS0tTUlJSSouLtbhw4ejZkKhkPx+vyzLkmVZ8vv9OnHiRNRMR0eHFi5cqKSkJKWlpam8vFwDAwOxvmwAADAKxTyEHn74YT3xxBOqra3VgQMHtGbNGq1du1aPPfaYM7NmzRqtX79etbW1am1tldfr1dy5c9Xb2+vMVFRUqL6+XnV1dWpubtbJkydVVFSkoaEhZ6akpESBQEANDQ1qaGhQIBCQ3+93zg8NDWnBggXq6+tTc3Oz6urqtHXrVlVWVsb6sgEAwCjksm3bjuUTFhUVyePx6JlnnnGO/ehHP9L48eP1/PPPy7Zt+Xw+VVRU6J577pH02e6Px+PRww8/rKVLlyocDuvKK6/U888/r5tvvlmSdOTIEWVkZGj79u2aN2+eDhw4oJycHLW0tCg/P1+S1NLSooKCAh08eFDZ2dnasWOHioqK1NnZKZ/PJ0mqq6vT4sWL1d3drZSUlC+8np6eHlmWpXA4fF7zY8lD+z4a6SXgIlo5NW2klwAAMXO+P79jviN0ww036NVXX9V7770nSXrrrbfU3NysH/zgB5KkQ4cOKRgMqrCw0HmM2+3WzJkztWvXLklSW1ubBgcHo2Z8Pp9yc3Odmd27d8uyLCeCJGnatGmyLCtqJjc314kgSZo3b54ikYja2trOuv5IJKKenp6oGwAAGJviY/2E99xzj8LhsL7xjW8oLi5OQ0NDevDBB/XTn/5UkhQMBiVJHo8n6nEej0cffPCBM5OQkKDU1NRhM6cfHwwGlZ6ePuz109PTo2bOfJ3U1FQlJCQ4M2eqqanR/fff/2UvGwAAjEIx3xF6+eWX9cILL+jFF1/Um2++qWeffVbr1q3Ts88+GzXncrmi7tu2PezYmc6cOdv8V5n5b6tWrVI4HHZunZ2d51wTAAAYvWK+I/TLX/5SK1eu1E9+8hNJ0pQpU/TBBx+opqZGt912m7xer6TPdmsmTZrkPK67u9vZvfF6vRoYGFAoFIraFeru7tb06dOdmaNHjw57/WPHjkU9z549e6LOh0IhDQ4ODtspOs3tdsvtdn/VywcAAKNIzHeEPv74Y112WfTTxsXFOR+fz8zMlNfrVVNTk3N+YGBAO3fudCInLy9P48aNi5rp6upSe3u7M1NQUKBwOKy9e/c6M3v27FE4HI6aaW9vV1dXlzP
T2Ngot9utvLy8GF85AAAYbWK+I7Rw4UI9+OCD+p//+R9985vf1L59+7R+/Xr97Gc/k/TZr6oqKipUXV2trKwsZWVlqbq6WuPHj1dJSYkkybIsLVmyRJWVlZo4caImTJigqqoqTZkyRXPmzJEkTZ48WfPnz1dpaak2bdokSbr99ttVVFSk7OxsSVJhYaFycnLk9/u1du1aHT9+XFVVVSotLTXuE2AAAGC4mIfQY489pl//+tcqKytTd3e3fD6fli5dqt/85jfOzIoVK9Tf36+ysjKFQiHl5+ersbFRycnJzsyGDRsUHx+vRYsWqb+/X7Nnz9bmzZsVFxfnzGzZskXl5eXOp8uKi4tVW1vrnI+Li9O2bdtUVlamGTNmKDExUSUlJVq3bl2sLxsAAIxCMf8eobGG7xGCKfgeIQBjyYh9jxAAAMBoQQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAw1gUJoQ8//FC33nqrJk6cqPHjx+s73/mO2tranPO2bWv16tXy+XxKTEzUrFmztH///qjniEQiWrZsmdLS0pSUlKTi4mIdPnw4aiYUCsnv98uyLFmWJb/frxMnTkTNdHR0aOHChUpKSlJaWprKy8s1MDBwIS4bAACMMjEPoVAopBkzZmjcuHHasWOH3nnnHT3yyCO64oornJk1a9Zo/fr1qq2tVWtrq7xer+bOnave3l5npqKiQvX19aqrq1Nzc7NOnjypoqIiDQ0NOTMlJSUKBAJqaGhQQ0ODAoGA/H6/c35oaEgLFixQX1+fmpubVVdXp61bt6qysjLWlw0AAEYhl23bdiyfcOXKlfrnP/+p119//aznbduWz+dTRUWF7rnnHkmf7f54PB49/PDDWrp0qcLhsK688ko9//zzuvnmmyVJR44cUUZGhrZv36558+bpwIEDysnJUUtLi/Lz8yVJLS0tKigo0MGDB5Wdna0dO3aoqKhInZ2d8vl8kqS6ujotXrxY3d3dSklJ+cLr6enpkWVZCofD5zU/ljy076ORXgIuopVT00Z6CQAQM+f78zvmO0KvvPKKrrvuOv34xz9Wenq6pk6dqqeeeso5f+jQIQWDQRUWFjrH3G63Zs6cqV27dkmS2traNDg4GDXj8/mUm5vrzOzevVuWZTkRJEnTpk2TZVlRM7m5uU4ESdK8efMUiUSiflX33yKRiHp6eqJuAABgbIp5CL3//vvauHGjsrKy9Ne//lV33HGHysvL9dxzz0mSgsGgJMnj8UQ9zuPxOOeCwaASEhKUmpp6zpn09PRhr5+enh41c+brpKamKiEhwZk5U01NjfOeI8uylJGR8WX/EQAAgFEi5iF06tQpXXvttaqurtbUqVO1dOlSlZaWauPGjVFzLpcr6r5t28OOnenMmbPNf5WZ/7Zq1SqFw2Hn1tnZec41AQCA0SvmITRp0iTl5OREHZs8ebI6OjokSV6vV5KG7ch0d3c7uzder1cDAwMKhULnnDl69Oiw1z927FjUzJmvEwqFNDg4OGyn6DS3262UlJSoGwAAGJtiHkIzZszQu+++G3Xsvffe09VXXy1JyszMlNfrVVNTk3N+YGBAO3f
u1PTp0yVJeXl5GjduXNRMV1eX2tvbnZmCggKFw2Ht3bvXmdmzZ4/C4XDUTHt7u7q6upyZxsZGud1u5eXlxfjKAQDAaBMf6ye8++67NX36dFVXV2vRokXau3evnnzyST355JOSPvtVVUVFhaqrq5WVlaWsrCxVV1dr/PjxKikpkSRZlqUlS5aosrJSEydO1IQJE1RVVaUpU6Zozpw5kj7bZZo/f75KS0u1adMmSdLtt9+uoqIiZWdnS5IKCwuVk5Mjv9+vtWvX6vjx46qqqlJpaSk7PQAAIPYhdP3116u+vl6rVq3SAw88oMzMTD366KO65ZZbnJkVK1aov79fZWVlCoVCys/PV2Njo5KTk52ZDRs2KD4+XosWLVJ/f79mz56tzZs3Ky4uzpnZsmWLysvLnU+XFRcXq7a21jkfFxenbdu2qaysTDNmzFBiYqJKSkq0bt26WF82AAAYhWL+PUJjDd8jBFPwPUIAxpIR+x4hAACA0YIQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEueAjV1NTI5XKpoqLCOWbbtlavXi2fz6fExETNmjVL+/fvj3pcJBLRsmXLlJaWpqSkJBUXF+vw4cNRM6FQSH6/X5ZlybIs+f1+nThxImqmo6NDCxcuVFJSktLS0lReXq6BgYELdbkAAGAUuaAh1NraqieffFLf+ta3oo6vWbNG69evV21trVpbW+X1ejV37lz19vY6MxUVFaqvr1ddXZ2am5t18uRJFRUVaWhoyJkpKSlRIBBQQ0ODGhoaFAgE5Pf7nfNDQ0NasGCB+vr61NzcrLq6Om3dulWVlZUX8rIBAMAoccFC6OTJk7rlllv01FNPKTU11Tlu27YeffRR3XvvvfrhD3+o3NxcPfvss/r444/14osvSpLC4bCeeeYZPfLII5ozZ46mTp2qF154QW+//bb+9re/SZIOHDighoYGPf300yooKFBBQYGeeuop/eUvf9G7774rSWpsbNQ777yjF154QVOnTtWcOXP0yCOP6KmnnlJPT8+FunQAADBKXLAQuvPOO7VgwQLNmTMn6vihQ4cUDAZVWFjoHHO73Zo5c6Z27dolSWpra9Pg4GDUjM/nU25urjOze/duWZal/Px8Z2batGmyLCtqJjc3Vz6fz5mZN2+eIpGI2trazrruSCSinp6eqBsAABib4i/Ek9bV1enNN99Ua2vrsHPBYFCS5PF4oo57PB598MEHzkxCQkLUTtLpmdOPDwaDSk9PH/b86enpUTNnvk5qaqoSEhKcmTPV1NTo/vvvP5/LBAAAo1zMd4Q6Ozv1i1/8Qi+88IIuv/zyz51zuVxR923bHnbsTGfOnG3+q8z8t1WrVikcDju3zs7Oc64JAACMXjEPoba2NnV3dysvL0/x8fGKj4/Xzp079dvf/lbx8fHODs2ZOzLd3d3OOa/Xq4GBAYVCoXPOHD16dNjrHzt2LGrmzNcJhUIaHBwctlN0mtvtVkpKStQNAACMTTEPodmzZ+vtt99WIBBwbtddd51uueUWBQIBXXPNNfJ6vWpqanIeMzAwoJ07d2r69OmSpLy8PI0bNy5qpqurS+3t7c5MQUG
BwuGw9u7d68zs2bNH4XA4aqa9vV1dXV3OTGNjo9xut/Ly8mJ96QAAYJSJ+XuEkpOTlZubG3UsKSlJEydOdI5XVFSourpaWVlZysrKUnV1tcaPH6+SkhJJkmVZWrJkiSorKzVx4kRNmDBBVVVVmjJlivPm68mTJ2v+/PkqLS3Vpk2bJEm33367ioqKlJ2dLUkqLCxUTk6O/H6/1q5dq+PHj6uqqkqlpaXs9AAAgAvzZukvsmLFCvX396usrEyhUEj5+flqbGxUcnKyM7NhwwbFx8dr0aJF6u/v1+zZs7V582bFxcU5M1u2bFF5ebnz6bLi4mLV1tY65+Pi4rRt2zaVlZVpxowZSkxMVElJidatW3fxLhYAAFyyXLZt2yO9iEtZT0+PLMtSOBw2bhfpoX0fjfQScBGtnJo20ksAgJg535/f/F1jAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAY8U8hGpqanT99dcrOTlZ6enpuummm/Tuu+9Gzdi2rdWrV8vn8ykxMVGzZs3S/v37o2YikYiWLVumtLQ0JSUlqbi4WIcPH46aCYVC8vv9sixLlmXJ7/frxIkTUTMdHR1auHChkpKSlJaWpvLycg0MDMT6sgEAwCgU8xDauXOn7rzzTrW0tKipqUmffvqpCgsL1dfX58ysWbNG69evV21trVpbW+X1ejV37lz19vY6MxUVFaqvr1ddXZ2am5t18uRJFRUVaWhoyJkpKSlRIBBQQ0ODGhoaFAgE5Pf7nfNDQ0NasGCB+vr61NzcrLq6Om3dulWVlZWxvmwAADAKuWzbti/kCxw7dkzp6enauXOnvve978m2bfl8PlVUVOiee+6R9Nnuj8fj0cMPP6ylS5cqHA7ryiuv1PPPP6+bb75ZknTkyBFlZGRo+/btmjdvng4cOKCcnBy1tLQoPz9fktTS0qKCggIdPHhQ2dnZ2rFjh4qKitTZ2SmfzydJqqur0+LFi9Xd3a2UlJQvXH9PT48sy1I4HD6v+bHkoX0fjfQScBGtnJo20ksAgJg535/fF/w9QuFwWJI0YcIESdKhQ4cUDAZVWFjozLjdbs2cOVO7du2SJLW1tWlwcDBqxufzKTc315nZvXu3LMtyIkiSpk2bJsuyomZyc3OdCJKkefPmKRKJqK2t7azrjUQi6unpiboBAICx6YKGkG3bWr58uW644Qbl5uZKkoLBoCTJ4/FEzXo8HudcMBhUQkKCUlNTzzmTnp4+7DXT09OjZs58ndTUVCUkJDgzZ6qpqXHec2RZljIyMr7sZQMAgFHigobQXXfdpX/961966aWXhp1zuVxR923bHnbsTGfOnG3+q8z8t1WrVikcDju3zs7Oc64JAACMXhcshJYtW6ZXXnlF//jHP3TVVVc5x71eryQN25Hp7u52dm+8Xq8GBgYUCoXOOXP06NFhr3vs2LGomTNfJxQKaXBwcNhO0Wlut1spKSlRNwAAMDbFPIRs29Zdd92lP/7xj/r73/+uzMzMqPOZmZnyer1qampyjg0MDGjnzp2aPn26JCkvL0/jxo2Lmunq6lJ7e7szU1BQoHA4rL179zoze/b
sUTgcjpppb29XV1eXM9PY2Ci32628vLxYXzoAABhl4mP9hHfeeadefPFF/fnPf1ZycrKzI2NZlhITE+VyuVRRUaHq6mplZWUpKytL1dXVGj9+vEpKSpzZJUuWqLKyUhMnTtSECRNUVVWlKVOmaM6cOZKkyZMna/78+SotLdWmTZskSbfffruKioqUnZ0tSSosLFROTo78fr/Wrl2r48ePq6qqSqWlpez0AACA2IfQxo0bJUmzZs2KOv6HP/xBixcvliStWLFC/f39KisrUygUUn5+vhobG5WcnOzMb9iwQfHx8Vq0aJH6+/s1e/Zsbd68WXFxcc7Mli1bVF5e7ny6rLi4WLW1tc75uLg4bdu2TWVlZZoxY4YSExNVUlKidevWxfqyAQDAKHTBv0dotON7hGAKvkcIwFhyyXyPEAAAwKWKEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxjAihxx9/XJmZmbr88suVl5en119/faSXBAAALgFjPoRefvllVVRU6N5779W+ffv03e9+VzfeeKM6OjpGemkAAGCEjfkQWr9+vZYsWaKf//znmjx5sh599FFlZGRo48aNI700AAAwwuJHegEX0sDAgNra2rRy5cqo44WFhdq1a9dZHxOJRBSJRJz74XBYktTT03PhFnqJ+uRk70gvARdRT0/CSC8BAGLm9M9t27bPOTemQ+ijjz7S0NCQPB5P1HGPx6NgMHjWx9TU1Oj+++8fdjwjI+OCrBG4VAz/tx4ARr/e3l5ZlvW558d0CJ3mcrmi7tu2PezYaatWrdLy5cud+6dOndLx48c1ceLEz30Mxo6enh5lZGSos7NTKSkpI70cADHEn2+z2Lat3t5e+Xy+c86N6RBKS0tTXFzcsN2f7u7uYbtEp7ndbrnd7qhjV1xxxYVaIi5RKSkp/IcSGKP4822Oc+0EnTam3yydkJCgvLw8NTU1RR1vamrS9OnTR2hVAADgUjGmd4Qkafny5fL7/bruuutUUFCgJ598Uh0dHbrjjjtGemkAAGCEjfkQuvnmm/Wf//xHDzzwgLq6upSbm6vt27fr6quvHuml4RLkdrt13333Dfv1KIDRjz/fOBuX/UWfKwMAABijxvR7hAAAAM6FEAIAAMYihAAAgLEIIQAAYCxCCAAAGGvMf3weOJfDhw9r48aN2rVrl4LBoFwulzwej6ZPn6477riDv2MOAMY4Pj4PYzU3N+vGG29URkaGCgsL5fF4ZNu2uru71dTUpM7OTu3YsUMzZswY6aUCuAA6Ozt133336fe///1ILwUjiBCCsa6//nrdcMMN2rBhw1nP33333WpublZra+tFXhmAi+Gtt97Stddeq6GhoZFeCkYQIQRjJSYmKhAIKDs7+6znDx48qKlTp6q/v/8irwxALLzyyivnPP/++++rsrKSEDIc7xGCsSZNmqRdu3Z9bgjt3r1bkyZNusirAhArN910k1wul871//sul+sirgiXIkIIxqqqqtIdd9yhtrY2zZ07Vx6PRy6XS8FgUE1NTXr66af16KOPjvQyAXxFkyZN0u9+9zvddNNNZz0fCASUl5d3cReFSw4hBGOVlZV
p4sSJ2rBhgzZt2uRsj8fFxSkvL0/PPfecFi1aNMKrBPBV5eXl6c033/zcEPqi3SKYgfcIAZIGBwf10UcfSZLS0tI0bty4EV4RgP+r119/XX19fZo/f/5Zz/f19emNN97QzJkzL/LKcCkhhAAAgLH4ZmkAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsf4XyIs+/Hge7xQAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# Your response here" + "# Your response here\n", + "x = df[\"isFraud\"].value_counts()\n", + "x.plot(kind=\"bar\", color=\"skyblue\")\n", + "plt.show() # 99870 to 130" ] }, { @@ -60,11 +445,245 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import matplotlib.pyplot as plt\\nimport pandas as pd'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import matplotlib.pyplot as plt\\nimport pandas as pd'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Your code here\n", + "type_dummies = pd.get_dummies(df[\"type\"])\n", + "new_df = pd.concat([df, type_dummies], axis=1)\n", + "new_df.drop([\"type\", \"DEBIT\"],axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'step'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3802\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m 3801\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 3802\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[0;32m 
3803\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\_libs\\index.pyx:138\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n", + "File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\_libs\\index.pyx:165\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n", + "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:5745\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n", + "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:5753\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n", + "\u001b[1;31mKeyError\u001b[0m: 'step'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[51], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m new_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mday\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m new_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstep\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m/\u001b[39m\u001b[38;5;241m30\u001b[39m\n\u001b[0;32m 2\u001b[0m new_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mday\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m new_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mday\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mint\u001b[39m)\n", + "File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\core\\frame.py:3807\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 3805\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels 
\u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m 3806\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[1;32m-> 3807\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mget_loc(key)\n\u001b[0;32m 3808\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[0;32m 3809\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3804\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m 3802\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[0;32m 3803\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m-> 3804\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[0;32m 3805\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[0;32m 3806\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[0;32m 3807\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[0;32m 3808\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[0;32m 3809\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[1;31mKeyError\u001b[0m: 'step'" + ] + } + ], + "source": [ + "new_df[\"day\"] = new_df[\"step\"]/30\n", + "new_df[\"day\"] = new_df[\"day\"].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "new_df[\"hour\"] = new_df[\"step\"]%24" + ] + }, + { + "cell_type": "code", + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "# Your code here\n" + "new_df.drop([\"nameOrig\", \"nameDest\", \"isFlaggedFraud\", \"step\"], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
amountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFraudCASH_INCASH_OUTPAYMENTTRANSFERdayhour
459852232736.40590426.66557690.260.000.00000101016
5077450300760.895558182.235858943.12423160.44122399.55010001119
5563911772630.810.000.001074556.051847186.8500001139
134483118602.0152154.0033551.990.000.0000010417
5080669205936.1021826.000.00463633.37669569.47001001119
\n", + "
" + ], + "text/plain": [ + " amount oldbalanceOrg newbalanceOrig oldbalanceDest \\\n", + "4598522 32736.40 590426.66 557690.26 0.00 \n", + "5077450 300760.89 5558182.23 5858943.12 423160.44 \n", + "5563911 772630.81 0.00 0.00 1074556.05 \n", + "1344831 18602.01 52154.00 33551.99 0.00 \n", + "5080669 205936.10 21826.00 0.00 463633.37 \n", + "\n", + " newbalanceDest isFraud CASH_IN CASH_OUT PAYMENT TRANSFER day \\\n", + "4598522 0.00 0 0 0 1 0 10 \n", + "5077450 122399.55 0 1 0 0 0 11 \n", + "5563911 1847186.85 0 0 0 0 1 13 \n", + "1344831 0.00 0 0 0 1 0 4 \n", + "5080669 669569.47 0 0 1 0 0 11 \n", + "\n", + " hour \n", + "4598522 16 \n", + "5077450 19 \n", + "5563911 9 \n", + "1344831 17 \n", + "5080669 19 " + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df.head()" ] }, { @@ -76,11 +695,221 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('from sklearn.model_selection import train_test_split\\nimport matplotlib.pyplot as plt\\nimport pandas as pd\\nfrom sklearn.linear_model import LogisticRegression'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Your code here\n", + "features = new_df.drop(\"isFraud\", axis=1)\n", + "label = new_df[\"isFraud\"]\n", + "X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.20)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('from sklearn.model_selection import train_test_split\\nimport matplotlib.pyplot as plt\\nimport pandas as pd\\nfrom sklearn.linear_model import 
LogisticRegression'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.99935\n", + "[[19968 6]\n", + " [ 7 19]]\n" + ] + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "log = LogisticRegression()\n", + "log.fit(X_train, y_train)\n", + "\n", + "print(log.score(X_test, y_test))\n", + "pred = log.predict(X_test)\n", + "print(confusion_matrix(y_pred = pred, y_true = y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('from sklearn.model_selection import train_test_split\\nimport matplotlib.pyplot as plt\\nimport pandas as pd\\nfrom sklearn.linear_model import LogisticRegression'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "train = pd.concat([X_train, y_train], axis=1)\n", + "\n", + "isnotFraud = train[train[\"isFraud\"]==0]\n", + "isFraud = train[train[\"isFraud\"]==1]" + ] + }, + { + "cell_type": "code", + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "isnotFraud_under = resample(isnotFraud,\n", + " replace=False,\n", + " n_samples = 50000,\n", + " random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('from sklearn.model_selection import train_test_split\\nimport matplotlib.pyplot as plt\\nimport pandas as pd\\nfrom sklearn.linear_model import LogisticRegression'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "train_under_sampled = 
pd.concat([isnotFraud_under, isFraud])" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_under = train_under_sampled.drop(\"isFraud\", axis=1)\n", + "y_train_under = train_under_sampled[\"isFraud\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "sm = SMOTE(random_state=1, sampling_strategy= 1.0)\n", + "X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train_under, y_train_under)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0    50000\n", + "1    50000\n", + "Name: isFraud, dtype: int64" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train_SMOTE.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + "        if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('from sklearn.model_selection import train_test_split\\nimport matplotlib.pyplot as plt\\nimport pandas as pd\\nfrom sklearn.linear_model import LogisticRegression'); }\n", + "    " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.98305\n", + "[[19635   339]\n", + " [    0    26]]\n" + ] + } + ], + "source": [ + "log = LogisticRegression()\n", + "log.fit(X_train_SMOTE, y_train_SMOTE)\n", + "\n", + "print(log.score(X_test, y_test))\n", + "pred = log.predict(X_test)\n", + "print(confusion_matrix(y_pred = pred, y_true = y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "## this way we were able to not miss any fraudulent activity on the test set anymore!\n", + "## However at the same time we have more hits on false positives (339 legitimate transactions flagged as fraud) in exchange for zero
false negatives." ] }, { @@ -92,11 +921,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9993\n", + "[[19968     6]\n", + " [    8    18]]\n" + ] + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "tree = DecisionTreeClassifier()\n", + "\n", + "tree.fit(X_train, y_train)\n", + "\n", + "print(tree.score(X_test, y_test))\n", + "pred = tree.predict(X_test)\n", + "print(confusion_matrix(y_pred = pred, y_true = y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "## first score very similar to logistic regression; if anything, a little bit worse." + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.99765\n", + "[[19932    42]\n", + " [    5    21]]\n" + ] + } + ], + "source": [ + "tree = DecisionTreeClassifier()\n", + "\n", + "tree.fit(X_train_SMOTE, y_train_SMOTE)\n", + "\n", + "print(tree.score(X_test, y_test))\n", + "pred = tree.predict(X_test)\n", + "print(confusion_matrix(y_pred = pred, y_true = y_test))" ] }, { @@ -108,11 +988,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ - "# Your response here" + "# the logistic regression worked better as it got no more false negatives with the balanced data. While the decision tree, \n", + "# even though the number of FNs also decreased (from 8 to 5), still got 5." ] }, { @@ -125,7 +1006,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -139,7 +1020,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.11.3" } }, "nbformat": 4,