diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb
index a3a5359..5bf1597 100644
--- a/your-code/lab_imbalance.ipynb
+++ b/your-code/lab_imbalance.ipynb
@@ -28,11 +28,329 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data=pd.read_csv(\"data.csv\").sample(100000)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " type | \n",
+ " amount | \n",
+ " nameOrig | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " nameDest | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5773241 | \n",
+ " 400 | \n",
+ " CASH_OUT | \n",
+ " 260792.12 | \n",
+ " C4980831 | \n",
+ " 0.0 | \n",
+ " 0.00 | \n",
+ " C1622414159 | \n",
+ " 577599.65 | \n",
+ " 838391.77 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4387242 | \n",
+ " 313 | \n",
+ " PAYMENT | \n",
+ " 4481.00 | \n",
+ " C1655611668 | \n",
+ " 34873.0 | \n",
+ " 30392.00 | \n",
+ " M1131146678 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6200936 | \n",
+ " 576 | \n",
+ " PAYMENT | \n",
+ " 10602.93 | \n",
+ " C2094686580 | \n",
+ " 168531.0 | \n",
+ " 157928.07 | \n",
+ " M40193656 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1278927 | \n",
+ " 135 | \n",
+ " CASH_IN | \n",
+ " 204902.45 | \n",
+ " C553935135 | \n",
+ " 0.0 | \n",
+ " 204902.45 | \n",
+ " C285926907 | \n",
+ " 427115.47 | \n",
+ " 222213.01 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1901227 | \n",
+ " 165 | \n",
+ " PAYMENT | \n",
+ " 11206.54 | \n",
+ " C659632965 | \n",
+ " 0.0 | \n",
+ " 0.00 | \n",
+ " M1675465918 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step type amount nameOrig oldbalanceOrg \\\n",
+ "5773241 400 CASH_OUT 260792.12 C4980831 0.0 \n",
+ "4387242 313 PAYMENT 4481.00 C1655611668 34873.0 \n",
+ "6200936 576 PAYMENT 10602.93 C2094686580 168531.0 \n",
+ "1278927 135 CASH_IN 204902.45 C553935135 0.0 \n",
+ "1901227 165 PAYMENT 11206.54 C659632965 0.0 \n",
+ "\n",
+ " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n",
+ "5773241 0.00 C1622414159 577599.65 838391.77 0 \n",
+ "4387242 30392.00 M1131146678 0.00 0.00 0 \n",
+ "6200936 157928.07 M40193656 0.00 0.00 0 \n",
+ "1278927 204902.45 C285926907 427115.47 222213.01 0 \n",
+ "1901227 0.00 M1675465918 0.00 0.00 0 \n",
+ "\n",
+ " isFlaggedFraud \n",
+ "5773241 0 \n",
+ "4387242 0 \n",
+ "6200936 0 \n",
+ "1278927 0 \n",
+ "1901227 0 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 100000.000000 | \n",
+ " 1.000000e+05 | \n",
+ " 1.000000e+05 | \n",
+ " 1.000000e+05 | \n",
+ " 1.000000e+05 | \n",
+ " 1.000000e+05 | \n",
+ " 100000.000000 | \n",
+ " 100000.0 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 243.518920 | \n",
+ " 1.790638e+05 | \n",
+ " 8.215886e+05 | \n",
+ " 8.430369e+05 | \n",
+ " 1.110068e+06 | \n",
+ " 1.234458e+06 | \n",
+ " 0.001270 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 142.161476 | \n",
+ " 5.732507e+05 | \n",
+ " 2.855118e+06 | \n",
+ " 2.891495e+06 | \n",
+ " 3.495241e+06 | \n",
+ " 3.739747e+06 | \n",
+ " 0.035615 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 1.000000 | \n",
+ " 2.100000e-01 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 156.000000 | \n",
+ " 1.326014e+04 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 238.000000 | \n",
+ " 7.474836e+04 | \n",
+ " 1.419874e+04 | \n",
+ " 0.000000e+00 | \n",
+ " 1.322503e+05 | \n",
+ " 2.118338e+05 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 334.000000 | \n",
+ " 2.087249e+05 | \n",
+ " 1.073190e+05 | \n",
+ " 1.442655e+05 | \n",
+ " 9.408718e+05 | \n",
+ " 1.108515e+06 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 738.000000 | \n",
+ " 4.669816e+07 | \n",
+ " 3.416935e+07 | \n",
+ " 3.421531e+07 | \n",
+ " 2.059810e+08 | \n",
+ " 2.105993e+08 | \n",
+ " 1.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step amount oldbalanceOrg newbalanceOrig \\\n",
+ "count 100000.000000 1.000000e+05 1.000000e+05 1.000000e+05 \n",
+ "mean 243.518920 1.790638e+05 8.215886e+05 8.430369e+05 \n",
+ "std 142.161476 5.732507e+05 2.855118e+06 2.891495e+06 \n",
+ "min 1.000000 2.100000e-01 0.000000e+00 0.000000e+00 \n",
+ "25% 156.000000 1.326014e+04 0.000000e+00 0.000000e+00 \n",
+ "50% 238.000000 7.474836e+04 1.419874e+04 0.000000e+00 \n",
+ "75% 334.000000 2.087249e+05 1.073190e+05 1.442655e+05 \n",
+ "max 738.000000 4.669816e+07 3.416935e+07 3.421531e+07 \n",
+ "\n",
+ " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n",
+ "count 1.000000e+05 1.000000e+05 100000.000000 100000.0 \n",
+ "mean 1.110068e+06 1.234458e+06 0.001270 0.0 \n",
+ "std 3.495241e+06 3.739747e+06 0.035615 0.0 \n",
+ "min 0.000000e+00 0.000000e+00 0.000000 0.0 \n",
+ "25% 0.000000e+00 0.000000e+00 0.000000 0.0 \n",
+ "50% 1.322503e+05 2.118338e+05 0.000000 0.0 \n",
+ "75% 9.408718e+05 1.108515e+06 0.000000 0.0 \n",
+ "max 2.059810e+08 2.105993e+08 1.000000 0.0 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.describe()"
]
},
{
@@ -44,11 +362,35 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 99873\n",
+ "1 127\n",
+ "Name: isFraud, dtype: int64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkIAAAGYCAYAAACu6o3UAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAjSElEQVR4nO3db3BU5d3/8c+akDVkkmMgZpetuTXOZFLS0BajEwK20AEClpBx2im20R2Z0ogTJY0kRRjbis6YKCAwNRVBbfEPGh/QtE6BNKntoCkEYnStQdAHogmSJViWDcS4ieHcD/xxfl2CiN4LIbner5l9sOd8d/c6jpi3V3YXl23btgAAAAx02XAvAAAAYLgQQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMFT/cC7jUnTp1SocPH1ZycrJcLtdwLwcAAJwH27Z14sQJ+Xw+XXbZF+/7EEJf4vDhw8rIyBjuZQAAgK+hs7NTV1111ReeJ4S+RHJysqTP/0GmpKQM82oAAMD56OnpUUZGhvNz/IsQQl/i9K/DUlJSCCEAAEaYL3tbC2+WBgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLG+cgi9+uqrmj9/vnw+n1wul/785z9HnbdtWytXrpTP51NiYqJmzJihffv2Rc1EIhEtWbJEaWlpSkpKUnFxsQ4dOhQ1EwqF5Pf7ZVmWLMuS3+/X8ePHo2Y6Ojo0f/58JSUlKS0tTeXl5erv74+aefvttzV9+nQlJibqG9/4hh588EHZtv1VLxsAAIxCXzmEent79Z3vfEe1tbVnPb9q1SqtXbtWtbW1am1tldfr1ezZs3XixAlnpqKiQvX19aqrq1Nzc7NOnjypoqIiDQ4OOjMlJSUKBAJqaGhQQ0ODAoGA/H6/c35wcFDz5s1Tb2+vmpubVVdXp61bt6qystKZ6enp0ezZs+Xz+dTa2qrHHntMa9as0dq1a7/qZQMAgNHI/j+QZNfX1zv3T506ZXu9Xvvhhx92jn366ae2ZVn2E088Ydu2bR8/ftweM2aMXVdX58x89NFH9mWXXWY3NDTYtm3b77zzji3JbmlpcWZ2795tS7IPHDhg27Ztb9++3b7sssvsjz76yJl58cUXbbfbbYfDYdu2bfvxxx+3LcuyP/30U2empqbG9vl89qlTp87rGsPhsC3JeU4AAHDpO9+f3zF9j9DBgwcVDAZVWFjoHHO73Zo+fbp27dolSWpra9PAwEDUjM/nU25urjOze/duWZal/Px8Z2bKlCmyLCtqJjc3Vz6fz5mZM2eOIpGI2tranJnp06fL7XZHzRw+fFgffPDBWa8hEomop6cn6gYAAEan+Fg+WTAYlCR5PJ6o4x6PRx9++KEzk5CQoNTU1CEzpx8fDAaVnp4+5PnT09OjZs58ndTUVCUkJETNXHPNNUNe5/S5zMzMIa9RU1OjBx544Lyud7S7Zvm24V4CLqIPHp433EsAgIvugnxqzOVyRd23bXvIsTOdOXO2+VjM2P/vjdJftJ4VK1YoHA47t87OznOuGwAAjFwxDSGv1yvp/+8Mndbd3e3sxHi9XvX39ysUCp1z5siRI0Oe/+jRo1EzZ75OKBTSwMDAOWe6u7slDd21Os3tdislJSXqBgAARqeYhlBmZqa8Xq+ampqcY/39/dq5c6emTp0qScrLy9OYMWOiZrq6utTe3u7MFBQUKBwOa+/evc7Mnj17FA6Ho2ba29vV1dXlzDQ2NsrtdisvL8+ZefXVV6M+Ut/Y2CifzzfkV2YAAMA8XzmETp48qUAgoEAgIOnzN0gHAgF1dHTI5XKpoqJC1dXVqq+vV3t7uxYuXKixY8eqpKREkmRZlhYtWqTKykq98sorevPNN3Xbbbdp0qRJmjVrliRp4sSJmjt3rkpLS9XS0qKWlhaVlpaqqKhI2dnZkqTCwkLl5OTI7/frzTff1CuvvKKqqiqVlpY6uzglJSVyu91auHCh2tvbVV9fr+rqai1duvRLf1UHAABGv6/8ZunXX39dP/jBD5z7S5culSTdfvvt2rx5s5YtW6a+vj6VlZUpFAopPz9fjY2NSk5Odh6zbt06xcfHa8GCBerr69PMmTO1efNmxcXFOTNbtmxReXm58+my4uLiqO8uiouL07Zt21RWVqZp06YpMTFRJSUlWrNmjTNjWZaampp011136frrr1dqaqqWLl3qrBkAAJjNZdt8zfK59PT0yLIshcNh494vxKfGzMKnxgCMJuf785u/awwAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxYh5Cn332mX79618rMzNTiYmJuvbaa/Xggw/q1KlTzoxt21q5cqV8Pp8SExM1Y8YM7du3L+p5IpGIlixZorS0NCUlJam4uFiHDh2KmgmFQvL7/bIsS5Zlye/36/jx41EzHR0dmj9/vpKSkpSWlqby8nL19/fH+rIBAMAIFPMQeuSRR/TEE0+otrZW+/fv16pVq7R69Wo99thjzsyqVau0du1a1dbWqrW1VV6vV7Nnz9aJEyecmYqKCtXX16uurk7Nzc06efKkioqKNDg46MyUlJQoEAiooaFBDQ0NCgQC8vv9zvnBwUHNmzdPvb29am5uVl1dnbZu3arKyspYXzYAABiBXLZt27F8wqKiInk8Hj399NPOsR//+McaO3asnnvuOdm2LZ/Pp4qKCt17772SPt/98Xg8euSRR7R48WKFw2FdeeWVeu6553TLLbdIkg4fPqyMjAxt375dc+bM0f79+5WTk6OWlhbl5+dLklpaWlRQUKADBw4oOztbO3bsUFFRkTo7O+Xz+SRJdXV1Wrhwobq7u5WSkvKl19PT0yPLshQOh89rfjS5Zvm24V4CLqIPHp433EsAgJg535/fMd8RuvHGG/XKK6/ovffekyS99dZbam5u1g9/+ENJ0sGDBxUMBlVYWOg8xu12a/r06dq1a5ckqa2tTQMDA1EzPp9Pubm5zszu3btlWZYTQZI0ZcoUWZYVNZObm+tEkCTNmTNHkUhEbW1tZ11/JBJRT09P1A0AAIxO8bF+wnvvvVfhcFjf/OY3FRcXp8HBQT300EP62c9+JkkKBoOSJI/HE/U4j8ejDz/80JlJSEhQamrqkJnTjw8Gg0pPTx/y+unp6VEzZ75OamqqEhISnJkz1dTU6IEHHviqlw0AAEagmO8IvfTSS3r++ef1wgsv6I033tAzzzyjNWvW6Jlnnomac7lcUfdt2x5y7Exnzpxt/uvM/LcVK1YoHA47t87OznOuCQAAjFwx3xH61a9+peXLl+unP/2pJGnSpEn68MMPVVNTo9tvv11er1fS57s1EyZMcB7X3d3t7N54vV719/crFApF7Qp1d3dr6tSpzsyRI0eGvP7Ro0ejnmfPnj1R50OhkAYGBobsFJ3mdrvldru/7uUDAIARJOY7Qp988okuuyz6aePi4pyPz2dmZsrr9aqpqck539/fr507dzqRk5eXpzFjxkTNdHV1qb293ZkpKChQOBzW3r17nZk9e/YoHA5HzbS3t6urq8uZaWxslNvtVl5eXoyvHAAAjDQx3xGaP3++HnroIf3P//yPvvWtb+nNN9/U2rVr9fOf/1zS57+qqqioUHV1tbKyspSVlaXq6mqNHTtWJSUlkiTLsrRo0SJVVlZq/PjxGjdunKqqqjRp0iTNmjVLkjRx4kTNnTtXpaWl2rhxoyTpjjvuUFFRkbKzsyVJhYWFysnJkd/v1+rVq3Xs2DFVVVWptLTUuE+AAQCAoWIeQo899ph+85vfqKysTN3d3fL5fFq8eLF++9vfOjPLli1TX1+fysrKFAqFlJ+fr8bGRiUnJzsz69atU3x8vBYsWKC+vj7NnDlTmzdvVlxcnDOzZcsWlZeXO58uKy4uVm1trXM+Li5O27ZtU1lZmaZNm6bExESVlJRozZo1sb5sAAAwAsX8e4RGG75HCKbge4QAjCbD9j1CAAAAIwUhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFiEEAAAMBYhBAAAjEUIAQAAYxFCAADAWIQQAAAwFiEEAACMRQgBAABjEUIAAMBYhBAAADAWIQQAAIxFCAEAAGMRQgAAwFgXJIQ++ugj3XbbbRo/frzGjh2r7373u2pra3PO27atlStXyufzKTExUTNmzNC+ffuiniMSiWjJkiVKS0tTUlKSiouLdejQoaiZUCgkv98vy7JkWZb8fr+OHz8eNdPR0aH58+crKSlJaWlpKi8vV39//4W4bAAAMMLEPIRCoZCmTZumMWPGaMeOHXrnnXf06KOP6oorrnBmVq1apbVr16q2tlatra3yer2aPXu2Tpw44cxUVFSovr5edXV1am5u1smTJ1VUVKTBwUFnpqSkRIFAQA0NDWpoaFAgEJDf73fODw4Oat68eert7VVzc7Pq6uq0detWVVZWxvqyAQDACOSybduO5RMuX75c//rXv/Taa6+d9bxt2/L5fKqoqNC9994r6fPdH4/Ho0ceeUSLFy9WOBzWlVdeqeeee0633HKLJOnw4cPKyMjQ9u3bNWfOHO3fv185OTlqaWlRfn6+JKmlpUUFBQU6cOCAsrOztWPHDhUVFamzs1M+n0+SVFdXp4ULF6q7u1spKSlfej09PT2yLEvhcPi85keTa5ZvG+4l4CL64OF5w70EAIiZ8/35HfMdoZdfflnXX3+9fvKTnyg9PV2TJ0/Wk08+6Zw/ePCggsGgCgsLnWNut1vTp0/Xrl27JEltbW0aGBiImvH5fMrNzXVmdu/eLcuynAiSpClTpsiyrKiZ3NxcJ4Ikac6cOYpEIlG/qvtvkUhEPT09UTcAADA6xTyE3n//fW3YsEFZWVn629/+pjvvvFPl5eV69tlnJUnBYFCS5PF4oh7n8Xicc8FgUAkJCUpNTT3nTHp6+pDXT09Pj5o583VSU1OVkJDgzJyppqbGec+RZVnKyMj4qv8IAADACBHzEDp16pSuu+46VVdXa/LkyVq8eLFKS0u1YcOGqDmXyxV137btIcfOdObM2ea/zsx/W7FihcLhsHPr7Ow855oAAMDIFfMQmjBhgnJycqKOTZw4UR0dHZIkr9crSUN2ZLq7u53dG6/Xq/7+foVCoXPOHDlyZMjrHz16NGrmzNcJhUIaGBgYslN0mtvtVkpKStQNAACMTjEPoWnTpundd9+NOvbee+/p6quvliRlZmbK6/WqqanJOd/f36+dO3dq6tSpkqS8vDyNGTMmaqarq0vt7e3OTEFBgcLhsPbu3evM7NmzR+FwOGqmvb1dXV1dzkxjY6Pcbrfy8vJifOUAAGCkiY/1E95zzz2aOnWqqqurtWDBAu3du1ebNm3Spk2bJH3+q6qKigpVV1crKytLWVlZqq6u1tixY1VSUiJJsixLixYtUmVlpcaPH69x48apqqpKkyZN0qxZsyR9vss0d+5clZaWauPGjZKkO+64Q0VFRcrOzpYkFRYWKicnR36/X6tXr9axY8dUVVWl0tJSdnoAAEDsQ+iGG25QfX29VqxYoQcffFCZmZlav369br31Vmdm2bJl6uvrU1lZmUKhkPLz89XY2Kjk5GRnZt26dYqPj9eCBQvU19enmTNnavPmzYqLi3NmtmzZovLycufTZcXFxaqtrXXOx8XFadu2bSorK9O0adOUmJiokpISrVmzJtaXDQAARqCYf4/QaMP3CMEUfI8QgNFk2L5HCAAAYKQghAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGCsCx5CNTU1crlcqqiocI7Ztq2VK1fK5/MpMTFRM2bM0L59+6IeF4lEtGTJEqWlpSkpKUnFxcU6dOhQ1EwoFJLf75dlWbIsS36/X8ePH4+a6ejo0Pz585WUlKS0tDSVl5erv7//Ql0uAAAYQS5oCLW2tmrTpk369re/HXV81apVWrt2rWpra9Xa2iqv16vZs2frxIkTzkxFRYXq6+tVV1en5uZmnTx5UkVFRRocHHRmSkpKFAgE1NDQoIaGBgUCAfn9fuf84OCg5s2bp97eXjU3N6uurk5bt25VZWXlhbxsAAAwQlywEDp58qRuvfVWPfnkk0pNTXWO27at9evX67777tOPfvQj5ebm6plnntEnn3yiF154QZIUDof19NNP69FHH9WsWbM0efJkPf/883r77bf197//XZK0f/9+NTQ06KmnnlJBQYEKCgr05JNP6q9//aveffddSVJjY6PeeecdPf/885o8ebJmzZqlRx99VE8++aR6enou1KUDAIAR4oKF0F133aV58+Zp1qxZUccPHjyoYDCowsJC55jb7db06dO1a9cuSVJbW5sGBgaiZnw+n3Jzc52Z3bt3y7Is5efnOzNTpkyRZVlRM7m5ufL5fM7MnDlzFIlE1NbWdtZ1RyIR9fT0RN0AAMDoFH8hnrSurk5vvPGGWltbh5wLBoOSJI/HE3Xc4/Howw8/dGYSEhKidpJOz5x+fDAYVHp6+pDnT09Pj5o583VSU1OVkJDgzJyppqZGDzzwwPlcJgAAGOFiviPU2dmpX/7yl3r++ed1+eWXf+Gcy+WKum/b9pBjZzpz5mzzX2fmv61YsULhcNi5dXZ2nnNNAABg5Ip5CLW1tam7u1t5eXmKj49XfHy8du7cqd/97neKj493dmjO3JHp7u52znm9XvX39ysUCp1z5siRI0Ne/+jRo1EzZ75OKBTSwMDAkJ2i09xut1JSUqJuAABgdIp5CM2cOVNvv/22AoGAc7v++ut16623KhAI6Nprr5XX61VTU5PzmP7+fu3cuVNTp06VJOXl5WnMmDFRM11dXWpvb3dmCgoKFA6HtXfvXmdmz549CofDUTPt7e3q6upyZhobG+V2u5WXlxfrSwcAACNMzN8jlJycrNzc3KhjSUlJGj9+vHO8oqJC1dXVysrKUlZWlqqrqzV27FiVlJRIkizL0qJFi1RZWanx48dr3Lhxqqqq0qRJk5w3X0+cOFFz585VaWmpNm7cKEm64447VFRUpOzsbElSYWGhcnJy5Pf7tXr1ah07dkxVVVUqLS1lpwcAAFyYN0t/mWXLlqmvr09lZWUKhULKz89XY2OjkpOTnZl169YpPj5eCxYsUF9fn2bOnKnNmzcrLi7OmdmyZYvKy8udT5cVFxertrbWOR8XF6dt27aprKxM06ZNU2JiokpKSrRmzZqLd7EAAOCS5bJt2x7uRVzKenp6ZFmWwuGwcbtI1yzfNtxLwEX0wcPzhnsJABAz5/vzm79rDAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYKyYh1BNTY1uuOEGJScnKz09XTfffLPefffdqBnbtrVy5Ur5fD4lJiZqxowZ2rdvX9RMJBLRkiVLlJaWpqSkJBUXF+vQoUNRM6FQSH6/X5ZlybIs+f1+HT9+PGqmo6ND8+fPV1JSktLS0lReXq7+/v5YXzYAABiBYh5CO3fu1F133aWWlhY1NTXps88+U2FhoXp7e52ZVatWae3ataqtrVVra6u8Xq9mz56tEydOODMVFRWqr69XXV2dmpubdfLkSRUVFWlwcNCZKSkpUSAQUENDgxoaGhQIBOT3+53zg4ODmjdvnnp7e9Xc3Ky6ujpt3bpVlZWVsb5sAAAwArls27Yv5AscPXpU6enp2rlzp77//e/Ltm35fD5VVFTo3nvvlfT57o/H49EjjzyixYsXKxwO68orr9Rzzz2nW265RZJ0+PBhZWRkaPv27ZozZ47279+vnJwctbS0KD8/X5LU0tKigoICHThwQNnZ2dqxY4eKiorU2dkpn88nSaqrq9PChQvV3d2tlJSUL11/T0+PLMtSOBw+r/nR5Jrl24Z7CbiIPnh43nAvAQBi5nx/fl/w9wiFw2FJ0rhx4yRJBw8eVDAYVGFhoTPjdrs1ffp07dq1S5LU1tamgYGBqBmfz6fc3FxnZvfu3bIsy4kgSZoyZYosy4qayc3NdSJIkubMmaNIJKK2trazrjcSiainpyfqBgAARqcLGkK2bWvp0qW68cYblZubK0kKBoOSJI/HEzXr8Xicc8FgUAkJCUpNTT3nTHp6+pDXTE9Pj5o583VSU1OVkJDgzJyppqbGec+RZVnKyMj4qpcNAABGiAsaQnfffbf+/e9/68UXXxxyzuVyRd23bXvIsTOdOXO2+a8z899WrFihcDjs3Do7O8+5JgAAMHJdsBBasmSJXn75Zf3zn//UVVdd5Rz3er2SNGRHpru729m98Xq96u/vVygUOufMkSNHhrzu0aNHo2bOfJ1QKKSBgYEhO0Wnud1upaSkRN0AAMDoFPMQsm1bd999t/70pz/pH//4hzIzM6POZ2Zmyuv1qqmpyTnW39+vnTt3aurUqZKkvLw8jRkzJmqmq6tL7e3tzkxBQYHC4bD27t3rzOzZs0fhcDhqpr29XV1dXc5MY2Oj3G638vLyYn3pAABghImP9RPeddddeuGFF/SXv/xFycnJzo6MZVlKTEyUy+VSRUWFqqurlZWVpaysLFVXV2vs2LEqKSlxZhctWqTKykqNHz9e48aNU1VVlSZNmqRZs2ZJkiZOnKi5c+eqtLRUGzdulCTdcccdKioqUnZ2tiSpsLBQOTk58vv9Wr16tY4dO6aqqiqVlpay0wMAAGIfQhs2bJAkzZgxI+r4H//4Ry1cuFCStGzZMvX19amsrEyhUEj5+flqbGxUcnKyM79u3TrFx8drwYIF6uvr08yZM7V582bFxcU5M1u2bFF5ebnz6bLi4mLV1tY65+Pi4rRt2zaVlZVp2rRpSkxMVElJidasWRPrywYAACPQBf8eoZGO7xGCKfgeIQCjySXzPUIAAACXKkIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsQghAABgLEIIAAAYixACAADGIoQAAICxCCEAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxjIihB5//HFlZmbq8ssvV15enl577bXhXhIAALgEjPoQeumll1RRUaH77rtPb775pr73ve/ppptuUkdHx3AvDQAADLNRH0Jr167VokWL9Itf/EITJ07U+vXrlZGRoQ0bNgz30gAAwDCLH+4FXEj9/f1qa2vT8uXLo44XFhZq165dZ31MJBJRJBJx7ofDYUlST0/PhVvoJepU5JPhXgIuIhP/HQcwep3+b5pt2+ecG9Uh9PHHH2twcFAejyfquMfjUTAYPOtjampq9MADDww5npGRcUHWCFwqrPXDvQIAiL0TJ07IsqwvPD+qQ+g0l8sVdd+27SHHTluxYoWWLl3q3D916pSOHTum8ePHf+FjMHr09PQoIyNDnZ2dSklJGe7lAIgh/nybxbZtnThxQj6f75xzozqE0tLSFBcXN2T3p7u7e8gu0Wlut1tutzvq2BVXXHGhlohLVEpKCv+hBEYp/nyb41w7QaeN6jdLJyQkKC8vT01NTVHHm5qaNHXq1GFaFQAAuFSM6h0hSVq6dKn8fr+uv/56FRQUaNOmTero6NCdd9453EsDAADDbNSH0C233KL//Oc/evDBB9XV1aXc3Fxt375dV1999XAvDZcgt9ut+++/f8ivRwGMfPz5xtm47C/7XBkAAMAoNarfIwQAAHAuhBAAADAWIQQAAIxFCAEAAGMRQgAAwFij/uPzwLkcOnRIGzZs0K5duxQMBuVyueTxeDR16lTdeeed/B1zADDK8fF5GKu5uVk33XSTMjIyVFhYKI/HI9u21d3draamJnV2dmrHjh2aNm3acC8VwAXQ2dmp+++/X3/4wx+GeykYRoQQjHXDDTfoxhtv1Lp16856/p577lFzc7NaW1sv8soAXAxvvfWWrrvuOg0ODg73UjCMCCEYKzExUYFAQNnZ2Wc9f+DAAU2ePFl9fX0XeWUAYuHll18+5/n3339flZWVhJDheI8QjDVhwgTt2rXrC0No9+7dmjBhwkVeFYBYufnmm+VyuXSu/993uVwXcUW4FBFCMFZVVZXuvPNOtbW1afbs2fJ4PHK5XAoGg2pqatJTTz2l9evXD/cyAXxNEyZM0O9//3vdfPPNZz0fCASUl5d3cReFSw4hBGOVlZVp/PjxWrdunTZu3Ohsj8fFxSkvL0/PPvusFixYMMyrBPB15eXl6Y033vjCEPqy3SKYgfcIAZIGBgb08ccfS5LS0tI0ZsyYYV4RgP+r1157Tb29vZo7d+5Zz/f29ur111/X9OnTL/LKcCkhhAAAgLH4ZmkAAGAsQggAABiLEAIAAMYihAAAgLEIIQAAYCxCCAAAGIsQAgAAxiKEAACAsf4XKb9AFHhw3MkAAAAASUVORK5CYII=",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
- "# Your response here"
+ "#assuming that the outcome here is \"isFraud\"\n",
+ "print(data[\"isFraud\"].value_counts())\n",
+ "data[\"isFraud\"].value_counts().plot(kind=\"bar\")\n",
+ "plt.show()\n",
+ "#extemely imbalanced distribution"
]
},
{
@@ -60,11 +402,79 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "step 0\n",
+ "type 0\n",
+ "amount 0\n",
+ "nameOrig 0\n",
+ "oldbalanceOrg 0\n",
+ "newbalanceOrig 0\n",
+ "nameDest 0\n",
+ "oldbalanceDest 0\n",
+ "newbalanceDest 0\n",
+ "isFraud 0\n",
+ "isFlaggedFraud 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.isnull().sum() #no nulls needed cleaned"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 100000 entries, 5773241 to 3416958\n",
+ "Data columns (total 11 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 step 100000 non-null int64 \n",
+ " 1 type 100000 non-null object \n",
+ " 2 amount 100000 non-null float64\n",
+ " 3 nameOrig 100000 non-null object \n",
+ " 4 oldbalanceOrg 100000 non-null float64\n",
+ " 5 newbalanceOrig 100000 non-null float64\n",
+ " 6 nameDest 100000 non-null object \n",
+ " 7 oldbalanceDest 100000 non-null float64\n",
+ " 8 newbalanceDest 100000 non-null float64\n",
+ " 9 isFraud 100000 non-null int64 \n",
+ " 10 isFlaggedFraud 100000 non-null int64 \n",
+ "dtypes: float64(5), int64(3), object(3)\n",
+ "memory usage: 9.2+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "data.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here\n"
+ "data=data.sort_values(by=[\"step\"])\n",
+ "#step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation)\n",
+ "#makes sense to have it as integral... if we are not treating it as time series\n",
+ "# if we were treating this as a time series - closing balance of one day is opening balance of next, then for the test split apply 80/20 split to sorted values"
]
},
{
@@ -76,11 +486,72 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X=data.drop(columns=[\"isFraud\",\"type\",\"nameOrig\",\"nameDest\"])\n",
+ "y=data[\"isFraud\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9977"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "log_reg=LogisticRegression()\n",
+ "log_reg.fit(X_train,y_train)\n",
+ "log_reg.score(X_test,y_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9981"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "log_reg.score(X_train,y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "#model seems to work suprisingly well, BUT let's check while solving imbalance"
]
},
{
@@ -92,11 +563,64 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(79899, 8)\n",
+ "(101, 8)\n",
+ "(79899, 8)\n"
+ ]
+ }
+ ],
+ "source": [
+ "#We are resampling the training test, not the test set. We test the data on imbalanced data\n",
+ "#oversampling\n",
+ "\n",
+ "train = pd.concat([X_train,y_train],axis=1)\n",
+ "\n",
+ "#separate minority from majority class\n",
+ "no_fraud=train[train[\"isFraud\"]==0]\n",
+ "yes_fraud=train[train[\"isFraud\"]==1]\n",
+ "print(no_fraud.shape)\n",
+ "print(yes_fraud.shape)\n",
+ "\n",
+ "from sklearn.utils import resample\n",
+ "fraud_oversampled = resample(yes_fraud, #data points we want to oversample\n",
+ " replace=True, #in oversampling, always True\n",
+ " n_samples = len(no_fraud),\n",
+ " random_state=0) #oversampling till it reaches the size of majority\n",
+ "print(fraud_oversampled.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9831\n",
+ "0.9819271830686241\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here"
+ "train_oversampled=pd.concat([no_fraud,fraud_oversampled])\n",
+ "\n",
+ "X_train_over = train_oversampled.drop(columns=\"isFraud\")\n",
+ "y_train_over = train_oversampled[\"isFraud\"]\n",
+ "\n",
+ "#now model\n",
+ "log_reg = LogisticRegression()\n",
+ "log_reg.fit(X_train_over,y_train_over)\n",
+ "print(log_reg.score(X_test,y_test))\n",
+ "print(log_reg.score(X_train_over,y_train_over))"
]
},
{
@@ -108,11 +632,11 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
- "# Your response here"
+ "#model works worse after solving for imbalance... great."
]
},
{
@@ -139,7 +663,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.11.4"
}
},
"nbformat": 4,