From 6c5f3135689641b0bd33baa009cd0291c34f6bdb Mon Sep 17 00:00:00 2001 From: DanielCarvalho93 Date: Wed, 17 Aug 2022 17:34:25 +0100 Subject: [PATCH 1/2] labdone --- your-code/lab_imbalance.ipynb | 378 ++++++++++++++++++++++++++++++++-- 1 file changed, 360 insertions(+), 18 deletions(-) diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb index a3a5359..8bdd6c6 100644 --- a/your-code/lab_imbalance.ipynb +++ b/your-code/lab_imbalance.ipynb @@ -28,11 +28,118 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
5259140372TRANSFER137349.44C1560961106142014.04664.56C63869276617049.88154399.3200
1942882177PAYMENT35990.52C1144248570.00.00M1483844490.000.0000
3623533274PAYMENT95.95C98773466219248.019152.05M20360895220.000.0000
\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg \\\n", + "5259140 372 TRANSFER 137349.44 C1560961106 142014.0 \n", + "1942882 177 PAYMENT 35990.52 C114424857 0.0 \n", + "3623533 274 PAYMENT 95.95 C987734662 19248.0 \n", + "\n", + " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n", + "5259140 4664.56 C638692766 17049.88 154399.32 0 \n", + "1942882 0.00 M148384449 0.00 0.00 0 \n", + "3623533 19152.05 M2036089522 0.00 0.00 0 \n", + "\n", + " isFlaggedFraud \n", + "5259140 0 \n", + "1942882 0 \n", + "3623533 0 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "import pandas as pd\n", + "\n", + "data = pd.read_csv(r'C:\\Users\\Daniel Carvalho\\Desktop\\DataAnalysis\\WEEK7\\DAY4\\labs\\lab-imbalance\\PS_20174392719_1491204439457_log.csv').sample(n=100000)\n", + "data.head(3)" ] }, { @@ -44,11 +151,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 99865\n", + "1 135\n", + "Name: isFraud, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['isFraud'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD4CAYAAAAZ1BptAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAARXElEQVR4nO3dfazeZX3H8fdHKooPPElHWMtWFuu2yrKIDdaYOGcNFFwoyZRA5qimsYmgc85sw+2PLj4kkm06SRTXSUcxTmDMjGbCmgYwZMuKHMQhD2OcgUA7kCPlYRvxofrdH/eFu1fP1Z6eu+c+Lef9Su6c6/f9Xb/f77p6Dudzfg/3TaoKSZKm86L5HoAk6dBlSEiSugwJSVKXISFJ6jIkJEldi+Z7AAfbCSecUMuWLZvvYUjSYeWOO+74blUt3rv+gguJZcuWMTExMd/DkKTDSpKHp6t7uUmS1GVISJK6DAlJUpchIUnqMiQkSV37DYkkm5M8keTuodrxSbYneaB9Pa7Vk+SyJJNJ7kpy2tA261r/B5KsG6q/Psm32jaXJcm+jiFJGp+ZnElcCazZq3YJcFNVLQduassAZwHL22sDcDkMfuEDG4E3AKcDG4d+6V8OvHdouzX7OYYkaUz2GxJVdSuwe6/yWmBLa28Bzh2qX1UDO4Bjk5wEnAlsr6rdVfUUsB1Y09YdXVU7avCZ5Vftta/pjiFJGpPZ3pM4saoea+3HgRNbewnw6FC/na22r/rOaer7OsZPSbIhyUSSiampqVlMR5I0nZHfcV1VlWRO/89F+ztGVW0CNgGsXLly1mNZdslXZ7vpSL79ybfPy3ElaX9meybxnXapiPb1iVbfBZw81G9pq+2rvnSa+r6OIUkak9mGxFbg+SeU1gHXD9UvbE85rQKeaZeMtgFnJDmu3bA+A9jW1j2bZFV7qunCvfY13TEkSWOy38tNSb4MvAU4IclOBk8pfRK4Nsl64GHgvNb9BuBsYBJ4DngPQFXtTvIx4PbW76NV9fzN8IsYPEF1FHBje7GPY0iSxmS/IVFVF3RWrZ6mbwEXd/azGdg8TX0COHWa+pPTHUOSND6+41qS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktQ1Ukgk+VCSe5LcneTLSV6a5JQktyWZTHJNkiNb35e05cm2ftnQfj7S6vcnOXOovqbVJpNcMspYJUkHbtYhkWQJ8DvAyqo6FTgCOB+4FPh0Vb0aeApY3zZZDzzV6p9u/Uiyom33WmAN8LkkRyQ5AvgscBawArig9ZUkjcmol5sWAUclWQS8DHgMeCtwXVu/BTi3tde2Zdr61UnS6ldX1fer6iFgEji9vSar6sGq+gFwdesrSRqTWYdEVe0C/gx4hEE4PAPcATxdVXtat53AktZeAjzatt3T+r9quL7XNr36T0myIclEkompqanZTkmStJdRLjcdx+Av+1OAnwVezuBy0dhV1aaqWllVKxcvXjwfQ5CkF6RRLje9DXioqqaq6ofAV4A3Ace2y08AS4Fdrb0LOBmgrT8GeHK4vtc2vbokaUxGCYlHgFVJXtbuLawG7gVuAd7R+qwDrm/trW2Ztv7mqqpWP789/XQKsBz4OnA7sLw9LXUkg5vbW0cYryTpAC3af5fpVdVtSa4DvgHsAe4ENgFfBa5O8vFWu6JtcgXwxSSTwG4Gv/SpqnuSXMsgYPYAF1fVjwCSvB/YxuDJqc1Vdc9sxytJOnCzDgmAqtoIbNyr/CCDJ5P27vs94J2d/XwC+MQ09RuAG0YZoyRp9nzHtSSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldI4VEkmOTXJfk35Lcl+SNSY5Psj3JA+3rca1vklyWZDLJXUlOG9rPutb/gSTrhuqvT/Ktts1lSTLKeCVJB2bUM4nPAP9YVb8E/CpwH3AJcFNVLQduassAZwHL22sDcDlAkuOBjcAbgNOBjc8HS+vz3qHt1ow4XknSAZh1SCQ5BngzcAVAVf2gqp4G1gJbWrctwLmtvRa4qgZ2AMcmOQk4E9heVbur6ilgO7CmrTu6qnZUVQFXDe1LkjQGo5xJnAJMAX+d5M4kX0jycuDEqnqs9XkcOLG1lwCPDm2/s9X2Vd85Tf2nJNmQZCLJxNTU1AhTkiQNGyUkFgGnAZdX1euA/+H/Li0B0M4AaoRjzEhVbaqqlVW1cvHixXN9OElaMEYJiZ3Azqq6rS1fxyA0vtMuFdG+PtHW7wJOHtp+aavtq750mrokaUxmHRJV9TjwaJJfbKXVwL3AVuD5J5TWAde39lbgwvaU0yrgmXZZahtwRpLj2g3rM4Btbd2zSVa1p5ouHNqXJGkMFo24/QeALyU5EngQeA+D4Lk2yXrgYeC81vcG4GxgEniu9aWqdif5GHB76/fRqtrd2hcBVwJHATe2lyRpTEYKiar6JrBymlWrp+lbwMWd/WwGNk9TnwBOHWWMkqTZ8x3XkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkrpFDIskRSe5M8g9t+ZQktyWZTHJNkiNb/SVtebKtXza0j4+0+v1Jzhyqr2m1ySSXjDpWSdKBORhnEh8E7htavhT4dFW9GngKWN/q64GnWv3TrR9JVgDnA68F1gCfa8FzBPBZ4CxgBXBB6ytJGpORQiLJUuDtwBfacoC3Ate1LluAc1t7bVumrV/d+q8Frq6q71fVQ8AkcHp7TVbVg1X1A+Dq1leSNCajnkn8BfAHwI/b8quAp6tqT1veCSxp7SXAowBt/TOt/0/qe23Tq/+UJBuSTCSZmJqaGnFKkqTnzTokkvwG8ERV3XEQxzMrVbWpqlZW1crFixfP93Ak6QVj0Qjbvgk4J8nZwEuBo4HPAMcmWdTOFpYCu1r/XcDJwM4ki4BjgCeH6s8b3qZXlySNwazPJKrqI1W1tKqWMbjxfHNV/RZwC/CO1m0dcH1rb23LtPU3V1W1+vnt6adTgOXA14HbgeXtaakj2zG2zna8kqQDN8qZRM8fAlcn+ThwJ3BFq18BfDHJJLCbwS99quqeJNcC9wJ7gIur6kcASd4PbAOOADZX1T1zMF5JUsdBCYmq+hrwtdZ+kMGTSXv3+R7wzs72nwA+MU39BuCGgzFGSdKB8x3XkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHXNOiSSnJzkliT3JrknyQdb/fgk25M80L4e1+pJclmSySR3JTltaF/rWv8Hkqwbqr8+ybfaNpclySiTlSQdmFHOJPYAH66qFcAq4OIkK4BLgJuqajlwU1sGOAtY3l4bgMthECrARuANwOnAxueDpfV579B2a0YYryTpAM06JKrqsar6Rmv/F3AfsARYC2xp3bYA57b2WuCqGtgBHJvkJOBMYHtV7a6qp4DtwJq27uiq2lFVBVw1tC9J0hgclHsSSZYBrwNuA06sqsfaqseBE1t7CfDo0GY7W21f9Z3T1Kc7/oYkE0kmpqamRpuMJOknRg6JJK8A/g743ap6dnhdOwOoUY+xP1W1qapWVtXKxYsXz/XhJGnBGCkkkryYQUB8qaq+0srfaZeKaF+faPVdwMlDmy9ttX3Vl05TlySNyShPNwW4Arivqj41tGor8PwTSuuA64fqF7annFYBz7TLUtuAM5Ic125YnwFsa+ueTbKqHevCoX1JksZg0Qjbvgn4beBbSb7Zan8EfBK4Nsl64GHgvLbuBuBsYBJ4DngPQFXtTvIx4PbW76NVtbu1LwKuBI4CbmwvSdKYzDokquqfgN77FlZP07+Aizv72gxsnqY+AZw62zFKkkbjO64lSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldh3xIJFmT5P4kk0kume/xSNJCckiHRJIjgM8CZwErgAuSrJjfUUnSwnFIhwRwOjBZVQ9W1Q+Aq4G18zwmSVowFs33APZjCfDo0PJO4A17d0qyAdjQFv87yf2zPN4JwHdnue2s5dJxH/H/mZc5zzPnvDAstDmPOt+fn654qIfEjFTVJmDTqPtJMlFVKw/CkA4bznlhcM4vfHM130P9ctMu4OSh5aWtJkkag0M9JG4Hlic5JcmRwPnA1nkekyQtGIf05aaq2pPk/cA24Ahgc1XdM4eHHPmS1WHIOS8MzvmFb07mm6qai/1Kkl4ADvXLTZKkeWRISJK6FmRI7O+jPpK8JMk1bf1tSZbNwzAPqhnM+feS3JvkriQ3JZn2menDyUw/0iXJbyapJIf145IzmW+S89r3+Z4kfzPuMR5sM/i5/rkktyS5s/1snz0f4zyYkmxO8kSSuzvrk+Sy9m9yV5LTRjpgVS2oF4Mb4P8B/AJwJPCvwIq9+lwEfL61zweume9xj2HOvw68rLXftxDm3Pq9ErgV2AGsnO9xz/H3eDlwJ3BcW/6Z+R73GOa8CXhfa68Avj3f4z4I834zcBpwd2f92cCNQIBVwG2jHG8hnknM5KM+1gJbWvs6YHWSjHGMB9t+51xVt1TVc21xB4P3pBzOZvqRLh8DLgW+N87BzYGZzPe9wGer6imAqnpizGM82GYy5wKObu1jgP8c4/jmRFXdCuzeR5e1wFU1sAM4NslJsz3eQgyJ6T7qY0mvT1XtAZ4BXjWW0c2Nmcx52HoGf4kczvY753YafnJVfXWcA5sjM/kevwZ4TZJ/TrIjyZqxjW5uzGTOfwK8K8lO4AbgA+MZ2rw60P/e9+mQfp+Exi/Ju4CVwK/N91jmUpIXAZ8C3j3PQxmnRQwuOb2FwZnirUl+paqens9BzbELgCur6s+TvBH4YpJTq+rH8z2ww8VCPJOYyUd9/KRPkkUMTlOfHMvo5saMPt4kyduAPwbOqarvj2lsc2V/c34lcCrwtSTfZnDtduthfPN6Jt/jncDWqvphVT0E/DuD0DhczWTO64FrAarqX4CXMvggvBeyg/pxRgsxJGbyUR9bgXWt/Q7g5mp3hA5T+51zktcBf8kgIA73a9WwnzlX1TNVdUJVLauqZQzuw5xTVRPzM9yRzeTn+u8ZnEWQ5AQGl58eHOMYD7aZzPkRYDVAkl9mEBJTYx3l+G0FLmxPOa0Cnqmqx2a7swV3uak6H/WR5KPARFVtBa5gcFo6yeAG0fnzN+LRzXDOfwq8Avjbdo/+kao6Z94GPaIZzvkFY4bz3QackeRe4EfA71fVYXuGPMM5fxj4qyQfYnAT+92H+R98JPkyg7A/od1r2Qi8GKCqPs/g3svZwCTwHPCekY53mP97SZLm0EK83CRJmiFDQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnrfwFcOjdkd3yn2wAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "# Your response here" + "# Your response here\n", + "import matplotlib.pyplot as plt\n", + "plt.hist(data['isFraud'])\n", + "plt.show()" ] }, { @@ -60,11 +205,163 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
step1.0000000.022924-0.011487-0.0115570.0247190.0232710.032341NaN
amount0.0229241.000000-0.005262-0.0090000.3036070.4734700.063983NaN
oldbalanceOrg-0.011487-0.0052621.0000000.9989660.0638260.0391850.005648NaN
newbalanceOrig-0.011557-0.0090000.9989661.0000000.0655410.039345-0.010291NaN
oldbalanceDest0.0247190.3036070.0638260.0655411.0000000.976133-0.007597NaN
newbalanceDest0.0232710.4734700.0391850.0393450.9761331.000000-0.003181NaN
isFraud0.0323410.0639830.005648-0.010291-0.007597-0.0031811.000000NaN
isFlaggedFraudNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " step amount oldbalanceOrg newbalanceOrig \\\n", + "step 1.000000 0.022924 -0.011487 -0.011557 \n", + "amount 0.022924 1.000000 -0.005262 -0.009000 \n", + "oldbalanceOrg -0.011487 -0.005262 1.000000 0.998966 \n", + "newbalanceOrig -0.011557 -0.009000 0.998966 1.000000 \n", + "oldbalanceDest 0.024719 0.303607 0.063826 0.065541 \n", + "newbalanceDest 0.023271 0.473470 0.039185 0.039345 \n", + "isFraud 0.032341 0.063983 0.005648 -0.010291 \n", + "isFlaggedFraud NaN NaN NaN NaN \n", + "\n", + " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "step 0.024719 0.023271 0.032341 NaN \n", + "amount 0.303607 0.473470 0.063983 NaN \n", + "oldbalanceOrg 0.063826 0.039185 0.005648 NaN \n", + "newbalanceOrig 0.065541 0.039345 -0.010291 NaN \n", + "oldbalanceDest 1.000000 0.976133 -0.007597 NaN \n", + "newbalanceDest 0.976133 1.000000 -0.003181 NaN \n", + "isFraud -0.007597 -0.003181 1.000000 NaN \n", + "isFlaggedFraud NaN NaN NaN NaN " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here\n" + "# Your code here\n", + "data.corr()" ] }, { @@ -76,11 +373,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.99868" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "X = pd.DataFrame(data['amount'])\n", + "y = pd.Series(data['isFraud'], name='labels')\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)\n", + "\n", + "model = LogisticRegression()\n", + "model.fit(X_train, y_train)\n", + "model.score(X_test,y_test)" ] }, { @@ -92,11 +412,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.99764" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "tree = DecisionTreeClassifier(random_state=1)\n", + "tree.fit(X_train,y_train)\n", + "tree.score(X_test,y_test)" ] }, { @@ -112,7 +448,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your response here" + "# Your response here\n", + "#the logistic regression as the decision tree is slightly over fiited" ] }, { @@ -125,7 +462,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.13 ('CLasses')", "language": "python", "name": "python3" }, @@ -139,7 +476,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "fd8103c585cc5071142faf5ebc004de0b2da2b7faa17a9198853f983d8f17421" + } } }, "nbformat": 4, From b45927193bb080814fb4b384def9b9bff1b32076 Mon Sep 17 00:00:00 2001 From: DanielCarvalho93 Date: Thu, 18 Aug 2022 11:36:05 +0100 Subject: [PATCH 2/2] done --- .gitignore | 2 + your-code/lab_imbalance.ipynb | 478 +++++++++++++++++++++++++--------- 2 files changed, 362 insertions(+), 118 deletions(-) diff --git a/.gitignore b/.gitignore index 0b0271f..b463273 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +PS_20174392719_1491204439457_log + # Created by https://www.gitignore.io/api/macos,pycharm,visualstudio,jupyternotebook,visualstudiocode # Edit at https://www.gitignore.io/?templates=macos,pycharm,visualstudio,jupyternotebook,visualstudiocode diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb index 8bdd6c6..8c7601e 100644 --- a/your-code/lab_imbalance.ipynb +++ b/your-code/lab_imbalance.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -67,44 +67,44 @@ " \n", " \n", " \n", - " 5259140\n", - " 372\n", - " TRANSFER\n", - " 137349.44\n", - " C1560961106\n", - " 142014.0\n", - " 4664.56\n", - " C638692766\n", - " 17049.88\n", - " 154399.32\n", + " 134364\n", + " 11\n", + " CASH_IN\n", + " 63402.09\n", + " C1610152695\n", + " 220.00\n", + " 63622.09\n", + " C2116285782\n", + " 764063.11\n", + " 1397717.72\n", " 0\n", " 0\n", " \n", " \n", - " 1942882\n", - " 177\n", + " 2805349\n", + " 225\n", " PAYMENT\n", - " 35990.52\n", - " C114424857\n", - " 0.0\n", - " 0.00\n", - " M148384449\n", + " 35153.23\n", + " C2146817319\n", + " 149911.95\n", + " 114758.72\n", + " M1426697974\n", " 0.00\n", " 0.00\n", " 0\n", " 0\n", " \n", " \n", - " 3623533\n", - " 274\n", - " PAYMENT\n", - " 95.95\n", - " C987734662\n", - " 19248.0\n", - " 19152.05\n", - " M2036089522\n", - " 0.00\n", - " 0.00\n", + " 2624794\n", + " 208\n", + " CASH_OUT\n", + " 173949.92\n", + " C822459157\n", + " 229678.15\n", + " 55728.23\n", + " C793116596\n", + " 7751776.83\n", + " 7925726.75\n", " 0\n", " 0\n", " \n", @@ -114,22 +114,22 @@ ], "text/plain": [ " step type amount nameOrig oldbalanceOrg \\\n", - "5259140 372 TRANSFER 137349.44 C1560961106 142014.0 \n", - "1942882 177 PAYMENT 35990.52 C114424857 0.0 \n", - "3623533 274 PAYMENT 95.95 C987734662 19248.0 \n", + "134364 11 CASH_IN 63402.09 C1610152695 220.00 \n", + "2805349 225 PAYMENT 35153.23 C2146817319 149911.95 \n", + "2624794 208 CASH_OUT 173949.92 C822459157 229678.15 \n", "\n", " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n", - "5259140 4664.56 C638692766 17049.88 154399.32 0 \n", - "1942882 0.00 M148384449 0.00 0.00 0 \n", - "3623533 19152.05 M2036089522 0.00 0.00 0 \n", + "134364 63622.09 C2116285782 764063.11 1397717.72 0 \n", + "2805349 114758.72 M1426697974 0.00 0.00 0 \n", + "2624794 55728.23 C793116596 7751776.83 7925726.75 0 \n", "\n", " isFlaggedFraud \n", - "5259140 0 \n", - "1942882 0 \n", - "3623533 0 " + "134364 0 \n", + "2805349 0 \n", + "2624794 0 " ] }, - "execution_count": 3, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -151,18 +151,18 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 99865\n", - "1 135\n", + "0 99857\n", + "1 143\n", "Name: isFraud, dtype: int64" ] }, - "execution_count": 9, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -173,12 +173,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD4CAYAAAAZ1BptAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAARXElEQVR4nO3dfazeZX3H8fdHKooPPElHWMtWFuu2yrKIDdaYOGcNFFwoyZRA5qimsYmgc85sw+2PLj4kkm06SRTXSUcxTmDMjGbCmgYwZMuKHMQhD2OcgUA7kCPlYRvxofrdH/eFu1fP1Z6eu+c+Lef9Su6c6/f9Xb/f77p6Dudzfg/3TaoKSZKm86L5HoAk6dBlSEiSugwJSVKXISFJ6jIkJEldi+Z7AAfbCSecUMuWLZvvYUjSYeWOO+74blUt3rv+gguJZcuWMTExMd/DkKTDSpKHp6t7uUmS1GVISJK6DAlJUpchIUnqMiQkSV37DYkkm5M8keTuodrxSbYneaB9Pa7Vk+SyJJNJ7kpy2tA261r/B5KsG6q/Psm32jaXJcm+jiFJGp+ZnElcCazZq3YJcFNVLQduassAZwHL22sDcDkMfuEDG4E3AKcDG4d+6V8OvHdouzX7OYYkaUz2GxJVdSuwe6/yWmBLa28Bzh2qX1UDO4Bjk5wEnAlsr6rdVfUUsB1Y09YdXVU7avCZ5Vftta/pjiFJGpPZ3pM4saoea+3HgRNbewnw6FC/na22r/rOaer7OsZPSbIhyUSSiampqVlMR5I0nZHfcV1VlWRO/89F+ztGVW0CNgGsXLly1mNZdslXZ7vpSL79ybfPy3ElaX9meybxnXapiPb1iVbfBZw81G9pq+2rvnSa+r6OIUkak9mGxFbg+SeU1gHXD9UvbE85rQKeaZeMtgFnJDmu3bA+A9jW1j2bZFV7qunCvfY13TEkSWOy38tNSb4MvAU4IclOBk8pfRK4Nsl64GHgvNb9BuBsYBJ4DngPQFXtTvIx4PbW76NV9fzN8IsYPEF1FHBje7GPY0iSxmS/IVFVF3RWrZ6mbwEXd/azGdg8TX0COHWa+pPTHUOSND6+41qS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktQ1Ukgk+VCSe5LcneTLSV6a5JQktyWZTHJNkiNb35e05cm2ftnQfj7S6vcnOXOovqbVJpNcMspYJUkHbtYhkWQJ8DvAyqo6FTgCOB+4FPh0Vb0aeApY3zZZDzzV6p9u/Uiyom33WmAN8LkkRyQ5AvgscBawArig9ZUkjcmol5sWAUclWQS8DHgMeCtwXVu/BTi3tde2Zdr61UnS6ldX1fer6iFgEji9vSar6sGq+gFwdesrSRqTWYdEVe0C/gx4hEE4PAPcATxdVXtat53AktZeAjzatt3T+r9quL7XNr36T0myIclEkompqanZTkmStJdRLjcdx+Av+1OAnwVezuBy0dhV1aaqWllVKxcvXjwfQ5CkF6RRLje9DXioqqaq6ofAV4A3Ace2y08AS4Fdrb0LOBmgrT8GeHK4vtc2vbokaUxGCYlHgFVJXtbuLawG7gVuAd7R+qwDrm/trW2Ztv7mqqpWP789/XQKsBz4OnA7sLw9LXUkg5vbW0cYryTpAC3af5fpVdVtSa4DvgHsAe4ENgFfBa5O8vFWu6JtcgXwxSSTwG4Gv/SpqnuSXMsgYPYAF1fVjwCSvB/YxuDJqc1Vdc9sxytJOnCzDgmAqtoIbNyr/CCDJ5P27vs94J2d/XwC+MQ09RuAG0YZoyRp9nzHtSSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldI4VEkmOTXJfk35Lcl+SNSY5Psj3JA+3rca1vklyWZDLJXUlOG9rPutb/gSTrhuqvT/Ktts1lSTLKeCVJB2bUM4nPAP9YVb8E/CpwH3AJcFNVLQduassAZwHL22sDcDlAkuOBjcAbgNOBjc8HS+vz3qHt1ow4XknSAZh1SCQ5BngzcAVAVf2gqp4G1gJbWrctwLmtvRa4qgZ2AMcmOQk4E9heVbur6ilgO7CmrTu6qnZUVQFXDe1LkjQGo5xJnAJMAX+d5M4kX0jycuDEqnqs9XkcOLG1lwCPDm2/s9X2Vd85Tf2nJNmQZCLJxNTU1AhTkiQNGyUkFgGnAZdX1euA/+H/Li0B0M4AaoRjzEhVbaqqlVW1cvHixXN9OElaMEYJiZ3Azqq6rS1fxyA0vtMuFdG+PtHW7wJOHtp+aavtq750mrokaUxmHRJV9TjwaJJfbKXVwL3AVuD5J5TWAde39lbgwvaU0yrgmXZZahtwRpLj2g3rM4Btbd2zSVa1p5ouHNqXJGkMFo24/QeALyU5EngQeA+D4Lk2yXrgYeC81vcG4GxgEniu9aWqdif5GHB76/fRqtrd2hcBVwJHATe2lyRpTEYKiar6JrBymlWrp+lbwMWd/WwGNk9TnwBOHWWMkqTZ8x3XkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkrpFDIskRSe5M8g9t+ZQktyWZTHJNkiNb/SVtebKtXza0j4+0+v1Jzhyqr2m1ySSXjDpWSdKBORhnEh8E7htavhT4dFW9GngKWN/q64GnWv3TrR9JVgDnA68F1gCfa8FzBPBZ4CxgBXBB6ytJGpORQiLJUuDtwBfacoC3Ate1LluAc1t7bVumrV/d+q8Frq6q71fVQ8AkcHp7TVbVg1X1A+Dq1leSNCajnkn8BfAHwI/b8quAp6tqT1veCSxp7SXAowBt/TOt/0/qe23Tq/+UJBuSTCSZmJqaGnFKkqTnzTokkvwG8ERV3XEQxzMrVbWpqlZW1crFixfP93Ak6QVj0Qjbvgk4J8nZwEuBo4HPAMcmWdTOFpYCu1r/XcDJwM4ki4BjgCeH6s8b3qZXlySNwazPJKrqI1W1tKqWMbjxfHNV/RZwC/CO1m0dcH1rb23LtPU3V1W1+vnt6adTgOXA14HbgeXtaakj2zG2zna8kqQDN8qZRM8fAlcn+ThwJ3BFq18BfDHJJLCbwS99quqeJNcC9wJ7gIur6kcASd4PbAOOADZX1T1zMF5JUsdBCYmq+hrwtdZ+kMGTSXv3+R7wzs72nwA+MU39BuCGgzFGSdKB8x3XkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHXNOiSSnJzkliT3JrknyQdb/fgk25M80L4e1+pJclmSySR3JTltaF/rWv8Hkqwbqr8+ybfaNpclySiTlSQdmFHOJPYAH66qFcAq4OIkK4BLgJuqajlwU1sGOAtY3l4bgMthECrARuANwOnAxueDpfV579B2a0YYryTpAM06JKrqsar6Rmv/F3AfsARYC2xp3bYA57b2WuCqGtgBHJvkJOBMYHtV7a6qp4DtwJq27uiq2lFVBVw1tC9J0hgclHsSSZYBrwNuA06sqsfaqseBE1t7CfDo0GY7W21f9Z3T1Kc7/oYkE0kmpqamRpuMJOknRg6JJK8A/g743ap6dnhdOwOoUY+xP1W1qapWVtXKxYsXz/XhJGnBGCkkkryYQUB8qaq+0srfaZeKaF+faPVdwMlDmy9ttX3Vl05TlySNyShPNwW4Arivqj41tGor8PwTSuuA64fqF7annFYBz7TLUtuAM5Ic125YnwFsa+ueTbKqHevCoX1JksZg0Qjbvgn4beBbSb7Zan8EfBK4Nsl64GHgvLbuBuBsYBJ4DngPQFXtTvIx4PbW76NVtbu1LwKuBI4CbmwvSdKYzDokquqfgN77FlZP07+Aizv72gxsnqY+AZw62zFKkkbjO64lSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldh3xIJFmT5P4kk0kume/xSNJCckiHRJIjgM8CZwErgAuSrJjfUUnSwnFIhwRwOjBZVQ9W1Q+Aq4G18zwmSVowFs33APZjCfDo0PJO4A17d0qyAdjQFv87yf2zPN4JwHdnue2s5dJxH/H/mZc5zzPnvDAstDmPOt+fn654qIfEjFTVJmDTqPtJMlFVKw/CkA4bznlhcM4vfHM130P9ctMu4OSh5aWtJkkag0M9JG4Hlic5JcmRwPnA1nkekyQtGIf05aaq2pPk/cA24Ahgc1XdM4eHHPmS1WHIOS8MzvmFb07mm6qai/1Kkl4ADvXLTZKkeWRISJK6FmRI7O+jPpK8JMk1bf1tSZbNwzAPqhnM+feS3JvkriQ3JZn2menDyUw/0iXJbyapJIf145IzmW+S89r3+Z4kfzPuMR5sM/i5/rkktyS5s/1snz0f4zyYkmxO8kSSuzvrk+Sy9m9yV5LTRjpgVS2oF4Mb4P8B/AJwJPCvwIq9+lwEfL61zweume9xj2HOvw68rLXftxDm3Pq9ErgV2AGsnO9xz/H3eDlwJ3BcW/6Z+R73GOa8CXhfa68Avj3f4z4I834zcBpwd2f92cCNQIBVwG2jHG8hnknM5KM+1gJbWvs6YHWSjHGMB9t+51xVt1TVc21xB4P3pBzOZvqRLh8DLgW+N87BzYGZzPe9wGer6imAqnpizGM82GYy5wKObu1jgP8c4/jmRFXdCuzeR5e1wFU1sAM4NslJsz3eQgyJ6T7qY0mvT1XtAZ4BXjWW0c2Nmcx52HoGf4kczvY753YafnJVfXWcA5sjM/kevwZ4TZJ/TrIjyZqxjW5uzGTOfwK8K8lO4AbgA+MZ2rw60P/e9+mQfp+Exi/Ju4CVwK/N91jmUpIXAZ8C3j3PQxmnRQwuOb2FwZnirUl+paqens9BzbELgCur6s+TvBH4YpJTq+rH8z2ww8VCPJOYyUd9/KRPkkUMTlOfHMvo5saMPt4kyduAPwbOqarvj2lsc2V/c34lcCrwtSTfZnDtduthfPN6Jt/jncDWqvphVT0E/DuD0DhczWTO64FrAarqX4CXMvggvBeyg/pxRgsxJGbyUR9bgXWt/Q7g5mp3hA5T+51zktcBf8kgIA73a9WwnzlX1TNVdUJVLauqZQzuw5xTVRPzM9yRzeTn+u8ZnEWQ5AQGl58eHOMYD7aZzPkRYDVAkl9mEBJTYx3l+G0FLmxPOa0Cnqmqx2a7swV3uak6H/WR5KPARFVtBa5gcFo6yeAG0fnzN+LRzXDOfwq8Avjbdo/+kao6Z94GPaIZzvkFY4bz3QackeRe4EfA71fVYXuGPMM5fxj4qyQfYnAT+92H+R98JPkyg7A/od1r2Qi8GKCqPs/g3svZwCTwHPCekY53mP97SZLm0EK83CRJmiFDQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnrfwFcOjdkd3yn2wAAAABJRU5ErkJggg==", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD1CAYAAAClSgmzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAO20lEQVR4nO3dcaidd33H8fdnyaJVqU3tJdQkXQJmk1gY1tBmCGOY0aZ1LP1DpWWsoQTzh+2mYzDj/gmoBYWxzoIWgs1MRYylExo0NYRoGWO0za2V1rTrcqmrSWjt1cR2m2iNfvfH/WUebu8vae5Jz7km7xcczvN8f7/f83wvhPvp85zn3KaqkCRpLr8z7gYkSQuXISFJ6jIkJEldhoQkqcuQkCR1GRKSpK7F427gXLvssstq1apV425Dkn6rPPbYYz+uqonZ9fMuJFatWsXk5OS425Ck3ypJnpur7u0mSVKXISFJ6jIkJEldhoQkqeuMIZFkZ5IXk3x/oHZpkv1JDrf3pa2eJHclmUryRJKrBtZsbvMPJ9k8UH9PkifbmruS5HTnkCSNzmu5kvgSsHFWbRtwoKrWAAfaPsD1wJr22grcDTO/8IHtwDXA1cD2gV/6dwMfHli38QznkCSNyBlDoqr+FTg+q7wJ2NW2dwE3DtTvrRkPA5ckuRy4DthfVcer6gSwH9jYxi6uqodr5m+W3zvrWHOdQ5I0IvP9TGJZVT3ftl8AlrXt5cCRgXlHW+109aNz1E93DknSiAz9ZbqqqiSv6/+56EznSLKVmdtbXHHFFa9nK+fMqm3fHHcL543/+sz7x92CdN6a75XEj9qtItr7i61+DFg5MG9Fq52uvmKO+unO8SpVtaOq1lXVuomJV32rXJI0T/MNiT3AqSeUNgMPDNRvaU85rQdeareM9gHXJlnaPrC+FtjXxl5Osr491XTLrGPNdQ5J0oic8XZTkq8CfwJcluQoM08pfQa4L8kW4DngQ236XuAGYAr4GXArQFUdT/Ip4GCb98mqOvVh+EeYeYLqIuDB9uI055AkjcgZQ6Kqbu4MbZhjbgG3dY6zE9g5R30SuHKO+k/mOockaXT8xrUkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKlrqJBI8jdJDiX5fpKvJnljktVJHkkyleRrSZa0uW9o+1NtfNXAcT7R6s8kuW6gvrHVppJsG6ZXSdLZm3dIJFkO/DWwrqquBBYBNwGfBe6sqncAJ4AtbckW4ESr39nmkWRtW/cuYCPwhSSLkiwCPg9cD6wFbm5zJUkjMuztpsXARUkWA28CngfeB9zfxncBN7btTW2fNr4hSVp9d1X9oqp+AEwBV7fXVFU9W1WvALvbXEnSiMw7JKrqGPAPwA+ZCYeXgMeAn1bVyTbtKLC8bS8HjrS1J9v8tw3WZ63p1SVJIzLM7aalzPyX/Wrg7cCbmbldNHJJtiaZTDI5PT09jhYk6bw0zO2mPwV+UFXTVfVL4OvAe4FL2u0ngBXAsbZ9DFgJ0MbfCvxksD5rTa/+KlW1o6rWVdW6iYmJIX4kSdKgYULih8D6JG9qny1sAJ4CvgN8oM3ZDDzQtve0fdr4t6uqWv2m9vTTamAN8ChwEFjTnpZawsyH23uG6FeSdJYWn3nK3KrqkST3A98FTgKPAzuAbwK7k3y61e5pS+4BvpxkCjjOzC99qupQkvuYCZiTwG1V9SuAJLcD+5h5cmpnVR2ab7+SpLM375AAqKrtwPZZ5WeZeTJp9tyfAx/sHOcO4I456nuBvcP0KEmaP79xLUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVLXUCGR5JIk9yf5jyRPJ/mjJJcm2Z/kcHtf2uYmyV1JppI8keSqgeNsbvMPJ9k8UH9PkifbmruSZJh+JUlnZ9gric8B36qqdwJ/CDwNbAMOVNUa4EDbB7geWNNeW4G7AZJcCmwHrgGuBrafCpY258MD6zYO2a8k6SzMOySSvBX4Y+AegKp6pap+CmwCdrVpu4Ab2/Ym4N6a8TBwSZLLgeuA/VV1vKpOAPuBjW3s4qp6uKoKuHfgWJKkERjmSmI1MA38c5LHk3wxyZuBZVX1fJvzArCsbS8HjgysP9pqp6sfnaMuSRqRYUJiMXAVcHdVvRv4X35zawmAdgVQQ5zjNUmyNclkksnp6enX+3SSdMEYJiSOAker6pG2fz8zofGjdquI9v5iGz8GrBxYv6LVTldfMUf9VapqR1Wtq6p1ExMTQ/xIkqRB8w6JqnoBOJLkD1ppA/AUsAc49YTSZuCBtr0HuKU95bQeeKndltoHXJtkafvA+lpgXxt7Ocn69lTTLQPHkiSNwOIh1/8V8JUkS4BngVuZCZ77kmwBngM+1ObuBW4ApoCftblU1fEknwIOtnmfrKrjbfsjwJeAi4AH20uSNCJDhURVfQ9YN8fQhjnmFnBb5zg7gZ1z1CeBK4fpUZI0f37jWpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1DV0SCRZlOTxJN9o+6uTPJJkKsnXkixp9Te0/ak2vmrgGJ9o9WeSXDdQ39hqU0m2DdurJOnsnIsriY8CTw/sfxa4s6reAZwAtrT6FuBEq9/Z5pFkLXAT8C5gI/CFFjyLgM8D1wNrgZvbXEnSiAwVEklWAO8Hvtj2A7wPuL9N2QXc2LY3tX3a+IY2fxOwu6p+UVU/AKaAq9trqqqerapXgN1triRpRIa9kvgn4O+AX7f9twE/raqTbf8osLxtLweOALTxl9r8/6/PWtOrv0qSrUkmk0xOT08P+SNJkk6Zd0gk+TPgxap67Bz2My9VtaOq1lXVuomJiXG3I0nnjcVDrH0v8OdJbgDeCFwMfA64JMnidrWwAjjW5h8DVgJHkywG3gr8ZKB+yuCaXl2SNALzvpKoqk9U1YqqWsXMB8/frqq/AL4DfKBN2ww80Lb3tH3a+Lerqlr9pvb002pgDfAocBBY056WWtLOsWe+/UqSzt4wVxI9Hwd2J/k08DhwT6vfA3w5yRRwnJlf+lTVoST3AU8BJ4HbqupXAEluB/YBi4CdVXXodehXktRxTkKiqh4CHmrbzzLzZNLsOT8HPthZfwdwxxz1vcDec9GjJOns+Y1rSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkrrmHRJJVib5TpKnkhxK8tFWvzTJ/iSH2/vSVk+Su5JMJXkiyVUDx9rc5h9Osnmg/p4kT7Y1dyXJMD+sJOnsDHMlcRL426paC6wHbkuyFtgGHKiqNcCBtg9wPbCmvbYCd8NMqADbgWuAq4Htp4KlzfnwwLqNQ/QrSTpL8w6Jqnq+qr7btv8beBpYDmwCdrVpu4Ab2/Ym4N6a8TBwSZLLgeuA/VV1vKpOAPuBjW3s4qp6uKoKuHfgWJKkETgnn0kkWQW8G3gEWFZVz7ehF4BlbXs5cGRg2dFWO1396Bx1SdKIDB0SSd4C/Avwsap6eXCsXQHUsOd4DT1sTTKZZHJ6evr1Pp0kXTCGCokkv8tMQHylqr7eyj9qt4po7y+2+jFg5cDyFa12uvqKOeqvUlU7qmpdVa2bmJgY5keSJA0Y5ummAPcAT1fVPw4M7QFOPaG0GXhgoH5Le8ppPfBSuy21D7g2ydL2gfW1wL429nKS9e1ctwwcS5I0AouHWPte4C+BJ5N8r9X+HvgMcF+SLcBzwIfa2F7gBmAK+BlwK0BVHU/yKeBgm/fJqjretj8CfAm4CHiwvSRJIzLvkKiqfwN631vYMMf8Am7rHGsnsHOO+iRw5Xx7lCQNx29cS5K6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSuhZ8SCTZmOSZJFNJto27H0m6kCzokEiyCPg8cD2wFrg5ydrxdiVJF44FHRLA1cBUVT1bVa8Au4FNY+5Jki4Yi8fdwBksB44M7B8Frpk9KclWYGvb/Z8kz4ygtwvFZcCPx93E6eSz4+5AY7Lg/23+lvm9uYoLPSRek6raAewYdx/noySTVbVu3H1Is/lvczQW+u2mY8DKgf0VrSZJGoGFHhIHgTVJVidZAtwE7BlzT5J0wVjQt5uq6mSS24F9wCJgZ1UdGnNbFxpv42mh8t/mCKSqxt2DJGmBWui3myRJY2RISJK6DAlJUteC/uBao5Xkncx8o315Kx0D9lTV0+PrStI4eSUhAJJ8nJk/exLg0fYK8FX/sKIWsiS3jruH85lPNwmAJP8JvKuqfjmrvgQ4VFVrxtOZdHpJflhVV4y7j/OVt5t0yq+BtwPPzapf3saksUnyRG8IWDbKXi40hoRO+RhwIMlhfvNHFa8A3gHcPq6mpGYZcB1wYlY9wL+Pvp0LhyEhAKrqW0l+n5k/zz74wfXBqvrV+DqTAPgG8Jaq+t7sgSQPjbybC4ifSUiSuny6SZLUZUhIkroMCUlSlyEhSeoyJCRJXf8Hd8Pz8m1yqC0AAAAASUVORK5CYII=", "text/plain": [ "
" ] @@ -192,10 +192,157 @@ "source": [ "# Your response here\n", "import matplotlib.pyplot as plt\n", - "plt.hist(data['isFraud'])\n", + "count_class = pd.value_counts(data['isFraud'])\n", + "count_class.plot(kind='bar')\n", "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
13436411CASH_IN63402.09C1610152695220.0063622.09C2116285782764063.111397717.7200
2805349225PAYMENT35153.23C2146817319149911.95114758.72M14266979740.000.0000
2624794208CASH_OUT173949.92C822459157229678.1555728.23C7931165967751776.837925726.7500
1906681166CASH_OUT23752.01C1116951121376.000.00C378897201385481.2974485.5400
4100102301CASH_IN313576.86C149486285421473405.0421786981.91C12530405157963543.317649966.4400
\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg \\\n", + "134364 11 CASH_IN 63402.09 C1610152695 220.00 \n", + "2805349 225 PAYMENT 35153.23 C2146817319 149911.95 \n", + "2624794 208 CASH_OUT 173949.92 C822459157 229678.15 \n", + "1906681 166 CASH_OUT 23752.01 C11169511 21376.00 \n", + "4100102 301 CASH_IN 313576.86 C1494862854 21473405.04 \n", + "\n", + " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n", + "134364 63622.09 C2116285782 764063.11 1397717.72 0 \n", + "2805349 114758.72 M1426697974 0.00 0.00 0 \n", + "2624794 55728.23 C793116596 7751776.83 7925726.75 0 \n", + "1906681 0.00 C378897201 385481.29 74485.54 0 \n", + "4100102 21786981.91 C1253040515 7963543.31 7649966.44 0 \n", + "\n", + " isFlaggedFraud \n", + "134364 0 \n", + "2805349 0 \n", + "2624794 0 \n", + "1906681 0 \n", + "4100102 0 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -205,7 +352,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "data = data.drop(columns=['type', 'nameOrig','nameDest'])\n", + "data['step'] = data['step'].astype(float)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -243,77 +400,77 @@ " \n", " step\n", " 1.000000\n", - " 0.022924\n", - " -0.011487\n", - " -0.011557\n", - " 0.024719\n", - " 0.023271\n", - " 0.032341\n", + " 0.026004\n", + " -0.013405\n", + " -0.013704\n", + " 0.030670\n", + " 0.029147\n", + " 0.035254\n", " NaN\n", " \n", " \n", " amount\n", - " 0.022924\n", + " 0.026004\n", " 1.000000\n", - " -0.005262\n", - " -0.009000\n", - " 0.303607\n", - " 0.473470\n", - " 0.063983\n", + " -0.003682\n", + " -0.008972\n", + " 0.295318\n", + " 0.466679\n", + " 0.078436\n", " NaN\n", " \n", " \n", " oldbalanceOrg\n", - " -0.011487\n", - " -0.005262\n", + " -0.013405\n", + " -0.003682\n", " 1.000000\n", - " 0.998966\n", - " 0.063826\n", - " 0.039185\n", - " 0.005648\n", + " 0.998807\n", + " 0.066678\n", + " 0.041310\n", + " 0.010898\n", " NaN\n", " \n", " \n", " newbalanceOrig\n", - " -0.011557\n", - " -0.009000\n", - " 0.998966\n", + " -0.013704\n", + " -0.008972\n", + " 0.998807\n", " 1.000000\n", - " 0.065541\n", - " 0.039345\n", - " -0.010291\n", + " 0.068186\n", + " 0.040854\n", + " -0.008077\n", " NaN\n", " \n", " \n", " oldbalanceDest\n", - " 0.024719\n", - " 0.303607\n", - " 0.063826\n", - " 0.065541\n", + " 0.030670\n", + " 0.295318\n", + " 0.066678\n", + " 0.068186\n", " 1.000000\n", - " 0.976133\n", - " -0.007597\n", + " 0.974118\n", + " -0.006947\n", " NaN\n", " \n", " \n", " newbalanceDest\n", - " 0.023271\n", - " 0.473470\n", - " 0.039185\n", - " 0.039345\n", - " 0.976133\n", + " 0.029147\n", + " 0.466679\n", + " 0.041310\n", + " 0.040854\n", + " 0.974118\n", " 1.000000\n", - " -0.003181\n", + " 0.001060\n", " NaN\n", " \n", " \n", " isFraud\n", - " 0.032341\n", - " 0.063983\n", - " 0.005648\n", - " -0.010291\n", - " -0.007597\n", - " -0.003181\n", + " 0.035254\n", + " 0.078436\n", + " 0.010898\n", + " -0.008077\n", + " -0.006947\n", + " 0.001060\n", " 1.000000\n", " NaN\n", " \n", @@ -334,27 +491,27 @@ ], "text/plain": [ " step amount oldbalanceOrg newbalanceOrig \\\n", - "step 1.000000 0.022924 -0.011487 -0.011557 \n", - "amount 0.022924 1.000000 -0.005262 -0.009000 \n", - "oldbalanceOrg -0.011487 -0.005262 1.000000 0.998966 \n", - "newbalanceOrig -0.011557 -0.009000 0.998966 1.000000 \n", - "oldbalanceDest 0.024719 0.303607 0.063826 0.065541 \n", - "newbalanceDest 0.023271 0.473470 0.039185 0.039345 \n", - "isFraud 0.032341 0.063983 0.005648 -0.010291 \n", + "step 1.000000 0.026004 -0.013405 -0.013704 \n", + "amount 0.026004 1.000000 -0.003682 -0.008972 \n", + "oldbalanceOrg -0.013405 -0.003682 1.000000 0.998807 \n", + "newbalanceOrig -0.013704 -0.008972 0.998807 1.000000 \n", + "oldbalanceDest 0.030670 0.295318 0.066678 0.068186 \n", + "newbalanceDest 0.029147 0.466679 0.041310 0.040854 \n", + "isFraud 0.035254 0.078436 0.010898 -0.008077 \n", "isFlaggedFraud NaN NaN NaN NaN \n", "\n", " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", - "step 0.024719 0.023271 0.032341 NaN \n", - "amount 0.303607 0.473470 0.063983 NaN \n", - "oldbalanceOrg 0.063826 0.039185 0.005648 NaN \n", - "newbalanceOrig 0.065541 0.039345 -0.010291 NaN \n", - "oldbalanceDest 1.000000 0.976133 -0.007597 NaN \n", - "newbalanceDest 0.976133 1.000000 -0.003181 NaN \n", - "isFraud -0.007597 -0.003181 1.000000 NaN \n", + "step 0.030670 0.029147 0.035254 NaN \n", + "amount 0.295318 0.466679 0.078436 NaN \n", + "oldbalanceOrg 0.066678 0.041310 0.010898 NaN \n", + "newbalanceOrig 0.068186 0.040854 -0.008077 NaN \n", + "oldbalanceDest 1.000000 0.974118 -0.006947 NaN \n", + "newbalanceDest 0.974118 1.000000 0.001060 NaN \n", + "isFraud -0.006947 0.001060 1.000000 NaN \n", "isFlaggedFraud NaN NaN NaN NaN " ] }, - "execution_count": 10, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -373,16 +530,16 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.99868" + "0.99808" ] }, - "execution_count": 16, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -391,8 +548,8 @@ "# Your code here\n", "from sklearn.linear_model import LogisticRegression\n", "\n", - "X = pd.DataFrame(data['amount'])\n", - "y = pd.Series(data['isFraud'], name='labels')\n", + "X = data.drop(columns=['isFraud'])\n", + "y = data['isFraud']\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", @@ -403,6 +560,55 @@ "model.score(X_test,y_test)" ] }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 24961\n", + " 1 0.41 0.51 0.45 39\n", + "\n", + " accuracy 1.00 25000\n", + " macro avg 0.70 0.76 0.73 25000\n", + "weighted avg 1.00 1.00 1.00 25000\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import classification_report, confusion_matrix\n", + "\n", + "pred = model.predict(X_test)\n", + "print(classification_report(y_test,pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[24932, 29],\n", + " [ 19, 20]], dtype=int64)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "confusion_matrix(y_test,pred)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -412,27 +618,59 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 29, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.91 0.95 24961\n", + " 1 0.02 1.00 0.03 39\n", + "\n", + " accuracy 0.91 25000\n", + " macro avg 0.51 0.95 0.49 25000\n", + "weighted avg 1.00 0.91 0.95 25000\n", + "\n" + ] + }, { "data": { "text/plain": [ - "0.99764" + "array([[22681, 2280],\n", + " [ 0, 39]], dtype=int64)" ] }, - "execution_count": 18, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Your code here\n", - "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.utils import resample\n", "\n", - "tree = DecisionTreeClassifier(random_state=1)\n", - "tree.fit(X_train,y_train)\n", - "tree.score(X_test,y_test)" + "train = pd.concat([X_train,y_train], axis=1)\n", + "\n", + "no_fraud = train[train['isFraud'] == 0]\n", + "yes_fraud = train[train['isFraud'] == 1]\n", + "\n", + "yes_fraud_oversampled = resample(yes_fraud, \n", + "replace=True, \n", + "n_samples=len(no_fraud), \n", + "random_state=0)\n", + "\n", + "train_oversampled = pd.concat([no_fraud,yes_fraud_oversampled])\n", + "\n", + "X_train_over = train_oversampled.drop(columns=['isFraud'])\n", + "y_train_over = train_oversampled['isFraud']\n", + "\n", + "model.fit(X_train_over,y_train_over)\n", + "pred = model.predict(X_test)\n", + "\n", + "print(classification_report(y_test,pred))\n", + "confusion_matrix(y_test,pred)" ] }, { @@ -444,12 +682,16 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# Your response here\n", - "#the logistic regression as the decision tree is slightly over fiited" + "\n", + "## The problem here with the logistic regression is that it doest not catch the frauds at all, it has 0 recall so our model is horrible\n", + "## even though we are very precise because of the inbalanced data..\n", + "\n", + "## meanwhile the second method we loose a litle bit of precision but we managed to capture everything single fraud and therefore save a lot of money." ] }, {