diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb index a3a5359..5bf1597 100644 --- a/your-code/lab_imbalance.ipynb +++ b/your-code/lab_imbalance.ipynb @@ -28,11 +28,329 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "data=pd.read_csv(\"data.csv\").sample(100000)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
5773241400CASH_OUT260792.12C49808310.00.00C1622414159577599.65838391.7700
4387242313PAYMENT4481.00C165561166834873.030392.00M11311466780.000.0000
6200936576PAYMENT10602.93C2094686580168531.0157928.07M401936560.000.0000
1278927135CASH_IN204902.45C5539351350.0204902.45C285926907427115.47222213.0100
1901227165PAYMENT11206.54C6596329650.00.00M16754659180.000.0000
\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg \\\n", + "5773241 400 CASH_OUT 260792.12 C4980831 0.0 \n", + "4387242 313 PAYMENT 4481.00 C1655611668 34873.0 \n", + "6200936 576 PAYMENT 10602.93 C2094686580 168531.0 \n", + "1278927 135 CASH_IN 204902.45 C553935135 0.0 \n", + "1901227 165 PAYMENT 11206.54 C659632965 0.0 \n", + "\n", + " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n", + "5773241 0.00 C1622414159 577599.65 838391.77 0 \n", + "4387242 30392.00 M1131146678 0.00 0.00 0 \n", + "6200936 157928.07 M40193656 0.00 0.00 0 \n", + "1278927 204902.45 C285926907 427115.47 222213.01 0 \n", + "1901227 0.00 M1675465918 0.00 0.00 0 \n", + "\n", + " isFlaggedFraud \n", + "5773241 0 \n", + "4387242 0 \n", + "6200936 0 \n", + "1278927 0 \n", + "1901227 0 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
count100000.0000001.000000e+051.000000e+051.000000e+051.000000e+051.000000e+05100000.000000100000.0
mean243.5189201.790638e+058.215886e+058.430369e+051.110068e+061.234458e+060.0012700.0
std142.1614765.732507e+052.855118e+062.891495e+063.495241e+063.739747e+060.0356150.0
min1.0000002.100000e-010.000000e+000.000000e+000.000000e+000.000000e+000.0000000.0
25%156.0000001.326014e+040.000000e+000.000000e+000.000000e+000.000000e+000.0000000.0
50%238.0000007.474836e+041.419874e+040.000000e+001.322503e+052.118338e+050.0000000.0
75%334.0000002.087249e+051.073190e+051.442655e+059.408718e+051.108515e+060.0000000.0
max738.0000004.669816e+073.416935e+073.421531e+072.059810e+082.105993e+081.0000000.0
\n", + "
" + ], + "text/plain": [ + " step amount oldbalanceOrg newbalanceOrig \\\n", + "count 100000.000000 1.000000e+05 1.000000e+05 1.000000e+05 \n", + "mean 243.518920 1.790638e+05 8.215886e+05 8.430369e+05 \n", + "std 142.161476 5.732507e+05 2.855118e+06 2.891495e+06 \n", + "min 1.000000 2.100000e-01 0.000000e+00 0.000000e+00 \n", + "25% 156.000000 1.326014e+04 0.000000e+00 0.000000e+00 \n", + "50% 238.000000 7.474836e+04 1.419874e+04 0.000000e+00 \n", + "75% 334.000000 2.087249e+05 1.073190e+05 1.442655e+05 \n", + "max 738.000000 4.669816e+07 3.416935e+07 3.421531e+07 \n", + "\n", + " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "count 1.000000e+05 1.000000e+05 100000.000000 100000.0 \n", + "mean 1.110068e+06 1.234458e+06 0.001270 0.0 \n", + "std 3.495241e+06 3.739747e+06 0.035615 0.0 \n", + "min 0.000000e+00 0.000000e+00 0.000000 0.0 \n", + "25% 0.000000e+00 0.000000e+00 0.000000 0.0 \n", + "50% 1.322503e+05 2.118338e+05 0.000000 0.0 \n", + "75% 9.408718e+05 1.108515e+06 0.000000 0.0 \n", + "max 2.059810e+08 2.105993e+08 1.000000 0.0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.describe()" ] }, { @@ -44,11 +362,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 99873\n", + "1 127\n", + "Name: isFraud, dtype: int64\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkIAAAGYCAYAAACu6o3UAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAjSElEQVR4nO3db3BU5d3/8c+akDVkkmMgZpetuTXOZFLS0BajEwK20AEClpBx2im20R2Z0ogTJY0kRRjbis6YKCAwNRVBbfEPGh/QtE6BNKntoCkEYnStQdAHogmSJViWDcS4ieHcD/xxfl2CiN4LIbner5l9sOd8d/c6jpi3V3YXl23btgAAAAx02XAvAAAAYLgQQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMFT/cC7jUnTp1SocPH1ZycrJcLtdwLwcAAJwH27Z14sQJ+Xw+XXbZF+/7EEJf4vDhw8rIyBjuZQAAgK+hs7NTV1111ReeJ4S+RHJysqTP/0GmpKQM82oAAMD56OnpUUZGhvNz/IsQQl/i9K/DUlJSCCEAAEaYL3tbC2+WBgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLG+cgi9+uqrmj9/vnw+n1wul/785z9HnbdtWytXrpTP51NiYqJmzJihffv2Rc1EIhEtWbJEaWlpSkpKUnFxsQ4dOhQ1EwqF5Pf7ZVmWLMuS3+/X8ePHo2Y6Ojo0f/58JSUlKS0tTeXl5erv74+aefvttzV9+nQlJibqG9/4hh588EHZtv1VLxsAAIxCXzmEent79Z3vfEe1tbVnPb9q1SqtXbtWtbW1am1tldfr1ezZs3XixAlnpqKiQvX19aqrq1Nzc7NOnjypoqIiDQ4OOjMlJSUKBAJqaGhQQ0ODAoGA/H6/c35wcFDz5s1Tb2+vmpubVVdXp61bt6qystKZ6enp0ezZs+Xz+dTa2qrHHntMa9as0dq1a7/qZQMAgNHI/j+QZNfX1zv3T506ZXu9Xvvhhx92jn366ae2ZVn2E088Ydu2bR8/ftweM2aMXVdX58x89NFH9mWXXWY3NDTYtm3b77zzji3JbmlpcWZ2795tS7IPHDhg27Ztb9++3b7sssvsjz76yJl58cUXbbfbbYfDYdu2bfvxxx+3LcuyP/30U2empqbG9vl89qlTp87rGsPhsC3JeU4AAHDpO9+f3zF9j9DBgwcVDAZVWFjoHHO73Zo+fbp27dolSWpra9PAwEDUjM/nU25urjOze/duWZal/Px8Z2bKlCmyLCtqJjc3Vz6fz5mZM2eOIpGI2tranJnp06fL7XZHzRw+fFgffPDBWa8hEomop6cn6gYAAEan+Fg+WTAYlCR5PJ6o4x6PRx9++KEzk5CQoNTU1CEzpx8fDAaVnp4+5PnT09OjZs58ndTUVCUkJETNXHPNNUNe5/S5zMzMIa9RU1OjBx544Lyud7S7Zvm24V4CLqIPHp433EsAgIvugnxqzOVyRd23bXvIsTOdOXO2+VjM2P/vjdJftJ4VK1YoHA47t87OznOuGwAAjFwxDSGv1yvp/+8Mndbd3e3sxHi9XvX39ysUCp1z5siRI0Oe/+jRo1EzZ75OKBTSwMDAOWe6u7slDd21Os3tdislJSXqBgAARqeYhlBmZqa8Xq+ampqcY/39/dq5c6emTp0qScrLy9OYMWOiZrq6utTe3u7MFBQUKBwOa+/evc7Mnj17FA6Ho2ba29vV1dXlzDQ2NsrtdisvL8+ZefXVV6M+Ut/Y2CifzzfkV2YAAMA8XzmETp48qUAgoEAgIOnzN0gHAgF1dHTI5XKpoqJC1dXVqq+vV3t7uxYuXKixY8eqpKREkmRZlhYtWqTKykq98sorevPNN3Xbbbdp0qRJmjVrliRp4sSJmjt3rkpLS9XS0qKWlhaVlpaqqKhI2dnZkqTCwkLl5OTI7/frzTff1CuvvKKqqiqVlpY6uzglJSVyu91auHCh2tvbVV9fr+rqai1duvRLf1UHAABGv6/8ZunXX39dP/jBD5z7S5culSTdfvvt2rx5s5YtW6a+vj6VlZUpFAopPz9fjY2NSk5Odh6zbt06xcfHa8GCBerr69PMmTO1efNmxcXFOTNbtmxReXm58+my4uLiqO8uiouL07Zt21RWVqZp06YpMTFRJSUlWrNmjTNjWZaampp011136frrr1dqaqqWLl3qrBkAAJjNZdt8zfK59PT0yLIshcNh494vxKfGzMKnxgCMJuf785u/awwAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxYh5Cn332mX79618rMzNTiYmJuvbaa/Xggw/q1KlTzoxt21q5cqV8Pp8SExM1Y8YM7du3L+p5IpGIlixZorS0NCUlJam4uFiHDh2KmgmFQvL7/bIsS5Zlye/36/jx41EzHR0dmj9/vpKSkpSWlqby8nL19/fH+rIBAMAIFPMQeuSRR/TEE0+otrZW+/fv16pVq7R69Wo99thjzsyqVau0du1a1dbWqrW1VV6vV7Nnz9aJEyecmYqKCtXX16uurk7Nzc06efKkioqKNDg46MyUlJQoEAiooaFBDQ0NCgQC8vv9zvnBwUHNmzdPvb29am5uVl1dnbZu3arKyspYXzYAABiBXLZt27F8wqKiInk8Hj399NPOsR//+McaO3asnnvuOdm2LZ/Pp4qKCt17772SPt/98Xg8euSRR7R48WKFw2FdeeWVeu6553TLLbdIkg4fPqyMjAxt375dc+bM0f79+5WTk6OWlhbl5+dLklpaWlRQUKADBw4oOztbO3bsUFFRkTo7O+Xz+SRJdXV1Wrhwobq7u5WSkvKl19PT0yPLshQOh89rfjS5Zvm24V4CLqIPHp433EsAgJg535/fMd8RuvHGG/XKK6/ovffekyS99dZbam5u1g9/+ENJ0sGDBxUMBlVYWOg8xu12a/r06dq1a5ckqa2tTQMDA1EzPp9Pubm5zszu3btlWZYTQZI0ZcoUWZYVNZObm+tEkCTNmTNHkUhEbW1tZ11/JBJRT09P1A0AAIxO8bF+wnvvvVfhcFjf/OY3FRcXp8HBQT300EP62c9+JkkKBoOSJI/HE/U4j8ejDz/80JlJSEhQamrqkJnTjw8Gg0pPTx/y+unp6VEzZ75OamqqEhISnJkz1dTU6IEHHviqlw0AAEagmO8IvfTSS3r++ef1wgsv6I033tAzzzyjNWvW6Jlnnomac7lcUfdt2x5y7Exnzpxt/uvM/LcVK1YoHA47t87OznOuCQAAjFwx3xH61a9+peXLl+unP/2pJGnSpEn68MMPVVNTo9tvv11er1fS57s1EyZMcB7X3d3t7N54vV719/crFApF7Qp1d3dr6tSpzsyRI0eGvP7Ro0ejnmfPnj1R50OhkAYGBobsFJ3mdrvldru/7uUDAIARJOY7Qp988okuuyz6aePi4pyPz2dmZsrr9aqpqck539/fr507dzqRk5eXpzFjxkTNdHV1qb293ZkpKChQOBzW3r17nZk9e/YoHA5HzbS3t6urq8uZaWxslNvtVl5eXoyvHAAAjDQx3xGaP3++HnroIf3P//yPvvWtb+nNN9/U2rVr9fOf/1zS57+qqqioUHV1tbKyspSVlaXq6mqNHTtWJSUlkiTLsrRo0SJVVlZq/PjxGjdunKqqqjRp0iTNmjVLkjRx4kTNnTtXpaWl2rhxoyTpjjvuUFFRkbKzsyVJhYWFysnJkd/v1+rVq3Xs2DFVVVWptLTUuE+AAQCAoWIeQo899ph+85vfqKysTN3d3fL5fFq8eLF++9vfOjPLli1TX1+fysrKFAqFlJ+fr8bGRiUnJzsz69atU3x8vBYsWKC+vj7NnDlTmzdvVlxcnDOzZcsWlZeXO58uKy4uVm1trXM+Li5O27ZtU1lZmaZNm6bExESVlJRozZo1sb5sAAAwAsX8e4RGG75HCKbge4QAjCbD9j1CAAAAIwUhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFgXJIQ++ugj3XbbbRo/frzGjh2r7373u2pra3PO27atlStXyufzKTExUTNmzNC+ffuiniMSiWjJkiVKS0tTUlKSiouLdejQoaiZUCgkv98vy7JkWZb8fr+OHz8eNdPR0aH58+crKSlJaWlpKi8vV39//4W4bAAAMMLEPIRCoZCmTZumMWPGaMeOHXrnnXf06KOP6oorrnBmVq1apbVr16q2tlatra3yer2aPXu2Tpw44cxUVFSovr5edXV1am5u1smTJ1VUVKTBwUFnpqSkRIFAQA0NDWpoaFAgEJDf73fODw4Oat68eert7VVzc7Pq6uq0detWVVZWxvqyAQDACOSybduO5RMuX75c//rXv/Taa6+d9bxt2/L5fKqoqNC9994r6fPdH4/Ho0ceeUSLFy9WOBzWlVdeqeeee0633HKLJOnw4cPKyMjQ9u3bNWfOHO3fv185OTlqaWlRfn6+JKmlpUUFBQU6cOCAsrOztWPHDhUVFamzs1M+n0+SVFdXp4ULF6q7u1spKSlfej09PT2yLEvhcPi85keTa5ZvG+4l4CL64OF5w70EAIiZ8/35HfMdoZdfflnXX3+9fvKTnyg9PV2TJ0/Wk08+6Zw/ePCggsGgCgsLnWNut1vTp0/Xrl27JEltbW0aGBiImvH5fMrNzXVmdu/eLcuynAiSpClTpsiyrKiZ3NxcJ4Ikac6cOYpEIlG/qvtvkUhEPT09UTcAADA6xTyE3n//fW3YsEFZWVn629/+pjvvvFPl5eV69tlnJUnBYFCS5PF4oh7n8Xicc8FgUAkJCUpNTT3nTHp6+pDXT09Pj5o583VSU1OVkJDgzJyppqbGec+RZVnKyMj4qv8IAADACBHzEDp16pSuu+46VVdXa/LkyVq8eLFKS0u1YcOGqDmXyxV137btIcfOdObM2ea/zsx/W7FihcLhsHPr7Ow855oAAMDIFfMQmjBhgnJycqKOTZw4UR0dHZIkr9crSUN2ZLq7u53dG6/Xq/7+foVCoXPOHDlyZMjrHz16NGrmzNcJhUIaGBgYslN0mtvtVkpKStQNAACMTjEPoWnTpundd9+NOvbee+/p6quvliRlZmbK6/WqqanJOd/f36+dO3dq6tSpkqS8vDyNGTMmaqarq0vt7e3OTEFBgcLhsPbu3evM7NmzR+FwOGqmvb1dXV1dzkxjY6Pcbrfy8vJifOUAAGCkiY/1E95zzz2aOnWqqqurtWDBAu3du1ebNm3Spk2bJH3+q6qKigpVV1crKytLWVlZqq6u1tixY1VSUiJJsixLixYtUmVlpcaPH69x48apqqpKkyZN0qxZsyR9vss0d+5clZaWauPGjZKkO+64Q0VFRcrOzpYkFRYWKicnR36/X6tXr9axY8dUVVWl0tJSdnoAAEDsQ+iGG25QfX29VqxYoQcffFCZmZlav369br31Vmdm2bJl6uvrU1lZmUKhkPLz89XY2Kjk5GRnZt26dYqPj9eCBQvU19enmTNnavPmzYqLi3NmtmzZovLycufTZcXFxaqtrXXOx8XFadu2bSorK9O0adOUmJiokpISrVmzJtaXDQAARqCYf4/QaMP3CMEUfI8QgNFk2L5HCAAAYKQghAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGCsCx5CNTU1crlcqqiocI7Ztq2VK1fK5/MpMTFRM2bM0L59+6IeF4lEtGTJEqWlpSkpKUnFxcU6dOhQ1EwoFJLf75dlWbIsS36/X8ePH4+a6ejo0Pz585WUlKS0tDSVl5erv7//Ql0uAAAYQS5oCLW2tmrTpk369re/HXV81apVWrt2rWpra9Xa2iqv16vZs2frxIkTzkxFRYXq6+tVV1en5uZmnTx5UkVFRRocHHRmSkpKFAgE1NDQoIaGBgUCAfn9fuf84OCg5s2bp97eXjU3N6uurk5bt25VZWXlhbxsAAAwQlywEDp58qRuvfVWPfnkk0pNTXWO27at9evX67777tOPfvQj5ebm6plnntEnn3yiF154QZIUDof19NNP69FHH9WsWbM0efJkPf/883r77bf197//XZK0f/9+NTQ06KmnnlJBQYEKCgr05JNP6q9//aveffddSVJjY6PeeecdPf/885o8ebJmzZqlRx99VE8++aR6enou1KUDAIAR4oKF0F133aV58+Zp1qxZUccPHjyoYDCowsJC55jb7db06dO1a9cuSVJbW5sGBgaiZnw+n3Jzc52Z3bt3y7Is5efnOzNTpkyRZVlRM7m5ufL5fM7MnDlzFIlE1NbWdtZ1RyIR9fT0RN0AAMDoFH8hnrSurk5vvPGGWltbh5wLBoOSJI/HE3Xc4/Howw8/dGYSEhKidpJOz5x+fDAYVHp6+pDnT09Pj5o583VSU1OVkJDgzJyppqZGDzzwwPlcJgAAGOFiviPU2dmpX/7yl3r++ed1+eWXf+Gcy+WKum/b9pBjZzpz5mzzX2fmv61YsULhcNi5dXZ2nnNNAABg5Ip5CLW1tam7u1t5eXmKj49XfHy8du7cqd/97neKj493dmjO3JHp7u52znm9XvX39ysUCp1z5siRI0Ne/+jRo1EzZ75OKBTSwMDAkJ2i09xut1JSUqJuAABgdIp5CM2cOVNvv/22AoGAc7v++ut16623KhAI6Nprr5XX61VTU5PzmP7+fu3cuVNTp06VJOXl5WnMmDFRM11dXWpvb3dmCgoKFA6HtXfvXmdmz549CofDUTPt7e3q6upyZhobG+V2u5WXlxfrSwcAACNMzN8jlJycrNzc3KhjSUlJGj9+vHO8oqJC1dXVysrKUlZWlqqrqzV27FiVlJRIkizL0qJFi1RZWanx48dr3Lhxqqqq0qRJk5w3X0+cOFFz585VaWmpNm7cKEm64447VFRUpOzsbElSYWGhcnJy5Pf7tXr1ah07dkxVVVUqLS1lpwcAAFyYN0t/mWXLlqmvr09lZWUKhULKz89XY2OjkpOTnZl169YpPj5eCxYsUF9fn2bOnKnNmzcrLi7OmdmyZYvKy8udT5cVFxertrbWOR8XF6dt27aprKxM06ZNU2JiokpKSrRmzZqLd7EAAOCS5bJt2x7uRVzKenp6ZFmWwuGwcbtI1yzfNtxLwEX0wcPzhnsJABAz5/vzm79rDAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYKyYh1BNTY1uuOEGJScnKz09XTfffLPefffdqBnbtrVy5Ur5fD4lJiZqxowZ2rdvX9RMJBLRkiVLlJaWpqSkJBUXF+vQoUNRM6FQSH6/X5ZlybIs+f1+HT9+PGqmo6ND8+fPV1JSktLS0lReXq7+/v5YXzYAABiBYh5CO3fu1F133aWWlhY1NTXps88+U2FhoXp7e52ZVatWae3ataqtrVVra6u8Xq9mz56tEydOODMVFRWqr69XXV2dmpubdfLkSRUVFWlwcNCZKSkpUSAQUENDgxoaGhQIBOT3+53zg4ODmjdvnnp7e9Xc3Ky6ujpt3bpVlZWVsb5sAAAwArls27Yv5AscPXpU6enp2rlzp77//e/Ltm35fD5VVFTo3nvvlfT57o/H49EjjzyixYsXKxwO68orr9Rzzz2nW265RZJ0+PBhZWRkaPv27ZozZ47279+vnJwctbS0KD8/X5LU0tKigoICHThwQNnZ2dqxY4eKiorU2dkpn88nSaqrq9PChQvV3d2tlJSUL11/T0+PLMtSOBw+r/nR5Jrl24Z7CbiIPnh43nAvAQBi5nx/fl/w9wiFw2FJ0rhx4yRJBw8eVDAYVGFhoTPjdrs1ffp07dq1S5LU1tamgYGBqBmfz6fc3FxnZvfu3bIsy4kgSZoyZYosy4qayc3NdSJIkubMmaNIJKK2trazrjcSiainpyfqBgAARqcLGkK2bWvp0qW68cYblZubK0kKBoOSJI/HEzXr8Xicc8FgUAkJCUpNTT3nTHp6+pDXTE9Pj5o583VSU1OVkJDgzJyppqbGec+RZVnKyMj4qpcNAABGiAsaQnfffbf+/e9/68UXXxxyzuVyRd23bXvIsTOdOXO2+a8z899WrFihcDjs3Do7O8+5JgAAMHJdsBBasmSJXn75Zf3zn//UVVdd5Rz3er2SNGRHpru729m98Xq96u/vVygUOufMkSNHhrzu0aNHo2bOfJ1QKKSBgYEhO0Wnud1upaSkRN0AAMDoFPMQsm1bd999t/70pz/pH//4hzIzM6POZ2Zmyuv1qqmpyTnW39+vnTt3aurUqZKkvLw8jRkzJmqmq6tL7e3tzkxBQYHC4bD27t3rzOzZs0fhcDhqpr29XV1dXc5MY2Oj3G638vLyYn3pAABghImP9RPeddddeuGFF/SXv/xFycnJzo6MZVlKTEyUy+VSRUWFqqurlZWVpaysLFVXV2vs2LEqKSlxZhctWqTKykqNHz9e48aNU1VVlSZNmqRZs2ZJkiZOnKi5c+eqtLRUGzdulCTdcccdKioqUnZ2tiSpsLBQOTk58vv9Wr16tY4dO6aqqiqVlpay0wMAAGIfQhs2bJAkzZgxI+r4H//4Ry1cuFCStGzZMvX19amsrEyhUEj5+flqbGxUcnKyM79u3TrFx8drwYIF6uvr08yZM7V582bFxcU5M1u2bFF5ebnz6bLi4mLV1tY65+Pi4rRt2zaVlZVp2rRpSkxMVElJidasWRPrywYAACPQBf8eoZGO7xGCKfgeIQCjySXzPUIAAACXKkIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxjIihB5//HFlZmbq8ssvV15enl577bXhXhIAALgEjPoQeumll1RRUaH77rtPb775pr73ve/ppptuUkdHx3AvDQAADLNRH0Jr167VokWL9Itf/EITJ07U+vXrlZGRoQ0bNgz30gAAwDCLH+4FXEj9/f1qa2vT8uXLo44XFhZq165dZ31MJBJRJBJx7ofDYUlST0/PhVvoJepU5JPhXgIuIhP/HQcwep3+b5pt2+ecG9Uh9PHHH2twcFAejyfquMfjUTAYPOtjampq9MADDww5npGRcUHWCFwqrPXDvQIAiL0TJ07IsqwvPD+qQ+g0l8sVdd+27SHHTluxYoWWLl3q3D916pSOHTum8ePHf+FjMHr09PQoIyNDnZ2dSklJGe7lAIgh/nybxbZtnThxQj6f75xzozqE0tLSFBcXN2T3p7u7e8gu0Wlut1tutzvq2BVXXHGhlohLVEpKCv+hBEYp/nyb41w7QaeN6jdLJyQkKC8vT01NTVHHm5qaNHXq1GFaFQAAuFSM6h0hSVq6dKn8fr+uv/56FRQUaNOmTero6NCdd9453EsDAADDbNSH0C233KL//Oc/evDBB9XV1aXc3Fxt375dV1999XAvDZcgt9ut+++/f8ivRwGMfPz5xtm47C/7XBkAAMAoNarfIwQAAHAuhBAAADAWIQQAAIxFCAEAAGMRQgAAwFij/uPzwLkcOnRIGzZs0K5duxQMBuVyueTxeDR16lTdeeed/B1zADDK8fF5GKu5uVk33XSTMjIyVFhYKI/HI9u21d3draamJnV2dmrHjh2aNm3acC8VwAXQ2dmp+++/X3/4wx+GeykYRoQQjHXDDTfoxhtv1Lp16856/p577lFzc7NaW1sv8soAXAxvvfWWrrvuOg0ODg73UjCMCCEYKzExUYFAQNnZ2Wc9f+DAAU2ePFl9fX0XeWUAYuHll18+5/n3339flZWVhJDheI8QjDVhwgTt2rXrC0No9+7dmjBhwkVeFYBYufnmm+VyuXSu/993uVwXcUW4FBFCMFZVVZXuvPNOtbW1afbs2fJ4PHK5XAoGg2pqatJTTz2l9evXD/cyAXxNEyZM0O9//3vdfPPNZz0fCASUl5d3cReFSw4hBGOVlZVp/PjxWrdunTZu3Ohsj8fFxSkvL0/PPvusFixYMMyrBPB15eXl6Y033vjCEPqy3SKYgfcIAZIGBgb08ccfS5LS0tI0ZsyYYV4RgP+r1157Tb29vZo7d+5Zz/f29ur111/X9OnTL/LKcCkhhAAAgLH4ZmkAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsf4XKb9AFHhw3MkAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# Your response here" + "#assuming that the outcome here is \"isFraud\"\n", + "print(data[\"isFraud\"].value_counts())\n", + "data[\"isFraud\"].value_counts().plot(kind=\"bar\")\n", + "plt.show()\n", + "#extemely imbalanced distribution" ] }, { @@ -60,11 +402,79 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "step 0\n", + "type 0\n", + "amount 0\n", + "nameOrig 0\n", + "oldbalanceOrg 0\n", + "newbalanceOrig 0\n", + "nameDest 0\n", + "oldbalanceDest 0\n", + "newbalanceDest 0\n", + "isFraud 0\n", + "isFlaggedFraud 0\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isnull().sum() #no nulls needed cleaned" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 100000 entries, 5773241 to 3416958\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 step 100000 non-null int64 \n", + " 1 type 100000 non-null object \n", + " 2 amount 100000 non-null float64\n", + " 3 nameOrig 100000 non-null object \n", + " 4 oldbalanceOrg 100000 non-null float64\n", + " 5 newbalanceOrig 100000 non-null float64\n", + " 6 nameDest 100000 non-null object \n", + " 7 oldbalanceDest 100000 non-null float64\n", + " 8 newbalanceDest 100000 non-null float64\n", + " 9 isFraud 100000 non-null int64 \n", + " 10 isFlaggedFraud 100000 non-null int64 \n", + "dtypes: float64(5), int64(3), object(3)\n", + "memory usage: 9.2+ MB\n" + ] + } + ], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# Your code here\n" + "data=data.sort_values(by=[\"step\"])\n", + "#step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation)\n", + "#makes sense to have it as integral... if we are not treating it as time series\n", + "# if we were treating this as a time series - closing balance of one day is opening balance of next, then for the test split apply 80/20 split to sorted values" ] }, { @@ -76,11 +486,72 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "X=data.drop(columns=[\"isFraud\",\"type\",\"nameOrig\",\"nameDest\"])\n", + "y=data[\"isFraud\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9977" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "log_reg=LogisticRegression()\n", + "log_reg.fit(X_train,y_train)\n", + "log_reg.score(X_test,y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9981" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "log_reg.score(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "#model seems to work suprisingly well, BUT let's check while solving imbalance" ] }, { @@ -92,11 +563,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(79899, 8)\n", + "(101, 8)\n", + "(79899, 8)\n" + ] + } + ], + "source": [ + "#We are resampling the training test, not the test set. We test the data on imbalanced data\n", + "#oversampling\n", + "\n", + "train = pd.concat([X_train,y_train],axis=1)\n", + "\n", + "#separate minority from majority class\n", + "no_fraud=train[train[\"isFraud\"]==0]\n", + "yes_fraud=train[train[\"isFraud\"]==1]\n", + "print(no_fraud.shape)\n", + "print(yes_fraud.shape)\n", + "\n", + "from sklearn.utils import resample\n", + "fraud_oversampled = resample(yes_fraud, #data points we want to oversample\n", + " replace=True, #in oversampling, always True\n", + " n_samples = len(no_fraud),\n", + " random_state=0) #oversampling till it reaches the size of majority\n", + "print(fraud_oversampled.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9831\n", + "0.9819271830686241\n" + ] + } + ], "source": [ - "# Your code here" + "train_oversampled=pd.concat([no_fraud,fraud_oversampled])\n", + "\n", + "X_train_over = train_oversampled.drop(columns=\"isFraud\")\n", + "y_train_over = train_oversampled[\"isFraud\"]\n", + "\n", + "#now model\n", + "log_reg = LogisticRegression()\n", + "log_reg.fit(X_train_over,y_train_over)\n", + "print(log_reg.score(X_test,y_test))\n", + "print(log_reg.score(X_train_over,y_train_over))" ] }, { @@ -108,11 +632,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ - "# Your response here" + "#model works worse after solving for imbalance... great." ] }, { @@ -139,7 +663,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.11.4" } }, "nbformat": 4,