diff --git a/.gitignore b/.gitignore
index 0b0271f..b463273 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+PS_20174392719_1491204439457_log*
+
# Created by https://www.gitignore.io/api/macos,pycharm,visualstudio,jupyternotebook,visualstudiocode
# Edit at https://www.gitignore.io/?templates=macos,pycharm,visualstudio,jupyternotebook,visualstudiocode
diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb
index a3a5359..8c7601e 100644
--- a/your-code/lab_imbalance.ipynb
+++ b/your-code/lab_imbalance.ipynb
@@ -28,11 +28,118 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 17,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " type | \n",
+ " amount | \n",
+ " nameOrig | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " nameDest | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 134364 | \n",
+ " 11 | \n",
+ " CASH_IN | \n",
+ " 63402.09 | \n",
+ " C1610152695 | \n",
+ " 220.00 | \n",
+ " 63622.09 | \n",
+ " C2116285782 | \n",
+ " 764063.11 | \n",
+ " 1397717.72 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2805349 | \n",
+ " 225 | \n",
+ " PAYMENT | \n",
+ " 35153.23 | \n",
+ " C2146817319 | \n",
+ " 149911.95 | \n",
+ " 114758.72 | \n",
+ " M1426697974 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2624794 | \n",
+ " 208 | \n",
+ " CASH_OUT | \n",
+ " 173949.92 | \n",
+ " C822459157 | \n",
+ " 229678.15 | \n",
+ " 55728.23 | \n",
+ " C793116596 | \n",
+ " 7751776.83 | \n",
+ " 7925726.75 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step type amount nameOrig oldbalanceOrg \\\n",
+ "134364 11 CASH_IN 63402.09 C1610152695 220.00 \n",
+ "2805349 225 PAYMENT 35153.23 C2146817319 149911.95 \n",
+ "2624794 208 CASH_OUT 173949.92 C822459157 229678.15 \n",
+ "\n",
+ " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n",
+ "134364 63622.09 C2116285782 764063.11 1397717.72 0 \n",
+ "2805349 114758.72 M1426697974 0.00 0.00 0 \n",
+ "2624794 55728.23 C793116596 7751776.83 7925726.75 0 \n",
+ "\n",
+ " isFlaggedFraud \n",
+ "134364 0 \n",
+ "2805349 0 \n",
+ "2624794 0 "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "import pandas as pd\n",
+ "\n",
+ "data = pd.read_csv(r'C:\\Users\\Daniel Carvalho\\Desktop\\DataAnalysis\\WEEK7\\DAY4\\labs\\lab-imbalance\\PS_20174392719_1491204439457_log.csv').sample(n=100000)\n",
+ "data.head(3)"
]
},
{
@@ -44,11 +151,196 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 99857\n",
+ "1 143\n",
+ "Name: isFraud, dtype: int64"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your response here"
+ "data['isFraud'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD1CAYAAAClSgmzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAO20lEQVR4nO3dcaidd33H8fdnyaJVqU3tJdQkXQJmk1gY1tBmCGOY0aZ1LP1DpWWsoQTzh+2mYzDj/gmoBYWxzoIWgs1MRYylExo0NYRoGWO0za2V1rTrcqmrSWjt1cR2m2iNfvfH/WUebu8vae5Jz7km7xcczvN8f7/f83wvhPvp85zn3KaqkCRpLr8z7gYkSQuXISFJ6jIkJEldhoQkqcuQkCR1GRKSpK7F427gXLvssstq1apV425Dkn6rPPbYYz+uqonZ9fMuJFatWsXk5OS425Ck3ypJnpur7u0mSVKXISFJ6jIkJEldhoQkqeuMIZFkZ5IXk3x/oHZpkv1JDrf3pa2eJHclmUryRJKrBtZsbvMPJ9k8UH9PkifbmruS5HTnkCSNzmu5kvgSsHFWbRtwoKrWAAfaPsD1wJr22grcDTO/8IHtwDXA1cD2gV/6dwMfHli38QznkCSNyBlDoqr+FTg+q7wJ2NW2dwE3DtTvrRkPA5ckuRy4DthfVcer6gSwH9jYxi6uqodr5m+W3zvrWHOdQ5I0IvP9TGJZVT3ftl8AlrXt5cCRgXlHW+109aNz1E93DknSiAz9ZbqqqiSv6/+56EznSLKVmdtbXHHFFa9nK+fMqm3fHHcL543/+sz7x92CdN6a75XEj9qtItr7i61+DFg5MG9Fq52uvmKO+unO8SpVtaOq1lXVuomJV32rXJI0T/MNiT3AqSeUNgMPDNRvaU85rQdeareM9gHXJlnaPrC+FtjXxl5Osr491XTLrGPNdQ5J0oic8XZTkq8CfwJcluQoM08pfQa4L8kW4DngQ236XuAGYAr4GXArQFUdT/Ip4GCb98mqOvVh+EeYeYLqIuDB9uI055AkjcgZQ6Kqbu4MbZhjbgG3dY6zE9g5R30SuHKO+k/mOockaXT8xrUkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKlrqJBI8jdJDiX5fpKvJnljktVJHkkyleRrSZa0uW9o+1NtfNXAcT7R6s8kuW6gvrHVppJsG6ZXSdLZm3dIJFkO/DWwrqquBBYBNwGfBe6sqncAJ4AtbckW4ESr39nmkWRtW/cuYCPwhSSLkiwCPg9cD6wFbm5zJUkjMuztpsXARUkWA28CngfeB9zfxncBN7btTW2fNr4hSVp9d1X9oqp+AEwBV7fXVFU9W1WvALvbXEnSiMw7JKrqGPAPwA+ZCYeXgMeAn1bVyTbtKLC8bS8HjrS1J9v8tw3WZ63p1SVJIzLM7aalzPyX/Wrg7cCbmbldNHJJtiaZTDI5PT09jhYk6bw0zO2mPwV+UFXTVfVL4OvAe4FL2u0ngBXAsbZ9DFgJ0MbfCvxksD5rTa/+KlW1o6rWVdW6iYmJIX4kSdKgYULih8D6JG9qny1sAJ4CvgN8oM3ZDDzQtve0fdr4t6uqWv2m9vTTamAN8ChwEFjTnpZawsyH23uG6FeSdJYWn3nK3KrqkST3A98FTgKPAzuAbwK7k3y61e5pS+4BvpxkCjjOzC99qupQkvuYCZiTwG1V9SuAJLcD+5h5cmpnVR2ab7+SpLM375AAqKrtwPZZ5WeZeTJp9tyfAx/sHOcO4I456nuBvcP0KEmaP79xLUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRl
SEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVLXUCGR5JIk9yf5jyRPJ/mjJJcm2Z/kcHtf2uYmyV1JppI8keSqgeNsbvMPJ9k8UH9PkifbmruSZJh+JUlnZ9gric8B36qqdwJ/CDwNbAMOVNUa4EDbB7geWNNeW4G7AZJcCmwHrgGuBrafCpY258MD6zYO2a8k6SzMOySSvBX4Y+AegKp6pap+CmwCdrVpu4Ab2/Ym4N6a8TBwSZLLgeuA/VV1vKpOAPuBjW3s4qp6uKoKuHfgWJKkERjmSmI1MA38c5LHk3wxyZuBZVX1fJvzArCsbS8HjgysP9pqp6sfnaMuSRqRYUJiMXAVcHdVvRv4X35zawmAdgVQQ5zjNUmyNclkksnp6enX+3SSdMEYJiSOAker6pG2fz8zofGjdquI9v5iGz8GrBxYv6LVTldfMUf9VapqR1Wtq6p1ExMTQ/xIkqRB8w6JqnoBOJLkD1ppA/AUsAc49YTSZuCBtr0HuKU95bQeeKndltoHXJtkafvA+lpgXxt7Ocn69lTTLQPHkiSNwOIh1/8V8JUkS4BngVuZCZ77kmwBngM+1ObuBW4ApoCftblU1fEknwIOtnmfrKrjbfsjwJeAi4AH20uSNCJDhURVfQ9YN8fQhjnmFnBb5zg7gZ1z1CeBK4fpUZI0f37jWpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1DV0SCRZlOTxJN9o+6uTPJJkKsnXkixp9Te0/ak2vmrgGJ9o9WeSXDdQ39hqU0m2DdurJOnsnIsriY8CTw/sfxa4s6reAZwAtrT6FuBEq9/Z5pFkLXAT8C5gI/CFFjyLgM8D1wNrgZvbXEnSiAwVEklWAO8Hvtj2A7wPuL9N2QXc2LY3tX3a+IY2fxOwu6p+UVU/AKaAq9trqqqerapXgN1triRpRIa9kvgn4O+AX7f9twE/raqTbf8osLxtLweOALTxl9r8/6/PWtOrv0qSrUkmk0xOT08P+SNJkk6Zd0gk+TPgxap67Bz2My9VtaOq1lXVuomJiXG3I0nnjcVDrH0v8OdJbgDeCFwMfA64JMnidrWwAjjW5h8DVgJHkywG3gr8ZKB+yuCaXl2SNALzvpKoqk9U1YqqWsXMB8/frqq/AL4DfKBN2ww80Lb3tH3a+Lerqlr9pvb002pgDfAocBBY056WWtLOsWe+/UqSzt4wVxI9Hwd2J/k08DhwT6vfA3w5yRRwnJlf+lTVoST3AU8BJ4HbqupXAEluB/YBi4CdVXXodehXktRxTkKiqh4CHmrbzzLzZNLsOT8HPthZfwdwxxz1vcDec9GjJOns+Y1rSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkrrmHRJJVib5TpKnkhxK8tFWvzTJ/iSH2/vSVk+Su5JMJXkiyVUDx9rc5h9Osnmg/p4kT7Y1dyXJMD+sJOnsDHMlcRL426paC6wHbkuyFtgGHKiqNcCBtg9wPbCmvbYCd8NMqADbgWuAq4Htp4KlzfnwwLqNQ/QrSTpL8w6Jqnq+qr7btv8beBpY
DmwCdrVpu4Ab2/Ym4N6a8TBwSZLLgeuA/VV1vKpOAPuBjW3s4qp6uKoKuHfgWJKkETgnn0kkWQW8G3gEWFZVz7ehF4BlbXs5cGRg2dFWO1396Bx1SdKIDB0SSd4C/Avwsap6eXCsXQHUsOd4DT1sTTKZZHJ6evr1Pp0kXTCGCokkv8tMQHylqr7eyj9qt4po7y+2+jFg5cDyFa12uvqKOeqvUlU7qmpdVa2bmJgY5keSJA0Y5ummAPcAT1fVPw4M7QFOPaG0GXhgoH5Le8ppPfBSuy21D7g2ydL2gfW1wL429nKS9e1ctwwcS5I0AouHWPte4C+BJ5N8r9X+HvgMcF+SLcBzwIfa2F7gBmAK+BlwK0BVHU/yKeBgm/fJqjretj8CfAm4CHiwvSRJIzLvkKiqfwN631vYMMf8Am7rHGsnsHOO+iRw5Xx7lCQNx29cS5K6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSuhZ8SCTZmOSZJFNJto27H0m6kCzokEiyCPg8cD2wFrg5ydrxdiVJF44FHRLA1cBUVT1bVa8Au4FNY+5Jki4Yi8fdwBksB44M7B8Frpk9KclWYGvb/Z8kz4ygtwvFZcCPx93E6eSz4+5AY7Lg/23+lvm9uYoLPSRek6raAewYdx/noySTVbVu3H1Is/lvczQW+u2mY8DKgf0VrSZJGoGFHhIHgTVJVidZAtwE7BlzT5J0wVjQt5uq6mSS24F9wCJgZ1UdGnNbFxpv42mh8t/mCKSqxt2DJGmBWui3myRJY2RISJK6DAlJUteC/uBao5Xkncx8o315Kx0D9lTV0+PrStI4eSUhAJJ8nJk/exLg0fYK8FX/sKIWsiS3jruH85lPNwmAJP8JvKuqfjmrvgQ4VFVrxtOZdHpJflhVV4y7j/OVt5t0yq+BtwPPzapf3saksUnyRG8IWDbKXi40hoRO+RhwIMlhfvNHFa8A3gHcPq6mpGYZcB1wYlY9wL+Pvp0LhyEhAKrqW0l+n5k/zz74wfXBqvrV+DqTAPgG8Jaq+t7sgSQPjbybC4ifSUiSuny6SZLUZUhIkroMCUlSlyEhSeoyJCRJXf8Hd8Pz8m1yqC0AAAAASUVORK5CYII=",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Your response here\n",
+ "import matplotlib.pyplot as plt\n",
+ "count_class = pd.value_counts(data['isFraud'])\n",
+ "count_class.plot(kind='bar')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " type | \n",
+ " amount | \n",
+ " nameOrig | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " nameDest | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 134364 | \n",
+ " 11 | \n",
+ " CASH_IN | \n",
+ " 63402.09 | \n",
+ " C1610152695 | \n",
+ " 220.00 | \n",
+ " 63622.09 | \n",
+ " C2116285782 | \n",
+ " 764063.11 | \n",
+ " 1397717.72 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2805349 | \n",
+ " 225 | \n",
+ " PAYMENT | \n",
+ " 35153.23 | \n",
+ " C2146817319 | \n",
+ " 149911.95 | \n",
+ " 114758.72 | \n",
+ " M1426697974 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2624794 | \n",
+ " 208 | \n",
+ " CASH_OUT | \n",
+ " 173949.92 | \n",
+ " C822459157 | \n",
+ " 229678.15 | \n",
+ " 55728.23 | \n",
+ " C793116596 | \n",
+ " 7751776.83 | \n",
+ " 7925726.75 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1906681 | \n",
+ " 166 | \n",
+ " CASH_OUT | \n",
+ " 23752.01 | \n",
+ " C11169511 | \n",
+ " 21376.00 | \n",
+ " 0.00 | \n",
+ " C378897201 | \n",
+ " 385481.29 | \n",
+ " 74485.54 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4100102 | \n",
+ " 301 | \n",
+ " CASH_IN | \n",
+ " 313576.86 | \n",
+ " C1494862854 | \n",
+ " 21473405.04 | \n",
+ " 21786981.91 | \n",
+ " C1253040515 | \n",
+ " 7963543.31 | \n",
+ " 7649966.44 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step type amount nameOrig oldbalanceOrg \\\n",
+ "134364 11 CASH_IN 63402.09 C1610152695 220.00 \n",
+ "2805349 225 PAYMENT 35153.23 C2146817319 149911.95 \n",
+ "2624794 208 CASH_OUT 173949.92 C822459157 229678.15 \n",
+ "1906681 166 CASH_OUT 23752.01 C11169511 21376.00 \n",
+ "4100102 301 CASH_IN 313576.86 C1494862854 21473405.04 \n",
+ "\n",
+ " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n",
+ "134364 63622.09 C2116285782 764063.11 1397717.72 0 \n",
+ "2805349 114758.72 M1426697974 0.00 0.00 0 \n",
+ "2624794 55728.23 C793116596 7751776.83 7925726.75 0 \n",
+ "1906681 0.00 C378897201 385481.29 74485.54 0 \n",
+ "4100102 21786981.91 C1253040515 7963543.31 7649966.44 0 \n",
+ "\n",
+ " isFlaggedFraud \n",
+ "134364 0 \n",
+ "2805349 0 \n",
+ "2624794 0 \n",
+ "1906681 0 \n",
+ "4100102 0 "
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.head()"
]
},
{
@@ -60,11 +352,173 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here\n"
+ "data = data.drop(columns=['type', 'nameOrig','nameDest'])\n",
+ "data['step'] = data['step'].astype(float)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | step | \n",
+ " 1.000000 | \n",
+ " 0.026004 | \n",
+ " -0.013405 | \n",
+ " -0.013704 | \n",
+ " 0.030670 | \n",
+ " 0.029147 | \n",
+ " 0.035254 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | amount | \n",
+ " 0.026004 | \n",
+ " 1.000000 | \n",
+ " -0.003682 | \n",
+ " -0.008972 | \n",
+ " 0.295318 | \n",
+ " 0.466679 | \n",
+ " 0.078436 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | oldbalanceOrg | \n",
+ " -0.013405 | \n",
+ " -0.003682 | \n",
+ " 1.000000 | \n",
+ " 0.998807 | \n",
+ " 0.066678 | \n",
+ " 0.041310 | \n",
+ " 0.010898 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | newbalanceOrig | \n",
+ " -0.013704 | \n",
+ " -0.008972 | \n",
+ " 0.998807 | \n",
+ " 1.000000 | \n",
+ " 0.068186 | \n",
+ " 0.040854 | \n",
+ " -0.008077 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | oldbalanceDest | \n",
+ " 0.030670 | \n",
+ " 0.295318 | \n",
+ " 0.066678 | \n",
+ " 0.068186 | \n",
+ " 1.000000 | \n",
+ " 0.974118 | \n",
+ " -0.006947 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | newbalanceDest | \n",
+ " 0.029147 | \n",
+ " 0.466679 | \n",
+ " 0.041310 | \n",
+ " 0.040854 | \n",
+ " 0.974118 | \n",
+ " 1.000000 | \n",
+ " 0.001060 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | isFraud | \n",
+ " 0.035254 | \n",
+ " 0.078436 | \n",
+ " 0.010898 | \n",
+ " -0.008077 | \n",
+ " -0.006947 | \n",
+ " 0.001060 | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | isFlaggedFraud | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step amount oldbalanceOrg newbalanceOrig \\\n",
+ "step 1.000000 0.026004 -0.013405 -0.013704 \n",
+ "amount 0.026004 1.000000 -0.003682 -0.008972 \n",
+ "oldbalanceOrg -0.013405 -0.003682 1.000000 0.998807 \n",
+ "newbalanceOrig -0.013704 -0.008972 0.998807 1.000000 \n",
+ "oldbalanceDest 0.030670 0.295318 0.066678 0.068186 \n",
+ "newbalanceDest 0.029147 0.466679 0.041310 0.040854 \n",
+ "isFraud 0.035254 0.078436 0.010898 -0.008077 \n",
+ "isFlaggedFraud NaN NaN NaN NaN \n",
+ "\n",
+ " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n",
+ "step 0.030670 0.029147 0.035254 NaN \n",
+ "amount 0.295318 0.466679 0.078436 NaN \n",
+ "oldbalanceOrg 0.066678 0.041310 0.010898 NaN \n",
+ "newbalanceOrig 0.068186 0.040854 -0.008077 NaN \n",
+ "oldbalanceDest 1.000000 0.974118 -0.006947 NaN \n",
+ "newbalanceDest 0.974118 1.000000 0.001060 NaN \n",
+ "isFraud -0.006947 0.001060 1.000000 NaN \n",
+ "isFlaggedFraud NaN NaN NaN NaN "
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Your code here\n",
+ "data.corr()"
]
},
{
@@ -76,11 +530,83 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.99808"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Your code here\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "X = data.drop(columns=['isFraud'])\n",
+ "y = data['isFraud']\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)\n",
+ "\n",
+ "model = LogisticRegression()\n",
+ "model.fit(X_train, y_train)\n",
+ "model.score(X_test,y_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 1.00 1.00 1.00 24961\n",
+ " 1 0.41 0.51 0.45 39\n",
+ "\n",
+ " accuracy 1.00 25000\n",
+ " macro avg 0.70 0.76 0.73 25000\n",
+ "weighted avg 1.00 1.00 1.00 25000\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here"
+ "from sklearn.metrics import classification_report, confusion_matrix\n",
+ "\n",
+ "pred = model.predict(X_test)\n",
+ "print(classification_report(y_test,pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[24932, 29],\n",
+ " [ 19, 20]], dtype=int64)"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "confusion_matrix(y_test,pred)"
]
},
{
@@ -92,11 +618,59 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 29,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 1.00 0.91 0.95 24961\n",
+ " 1 0.02 1.00 0.03 39\n",
+ "\n",
+ " accuracy 0.91 25000\n",
+ " macro avg 0.51 0.95 0.49 25000\n",
+ "weighted avg 1.00 0.91 0.95 25000\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[22681, 2280],\n",
+ " [ 0, 39]], dtype=int64)"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here"
+ "from sklearn.utils import resample\n",
+ "\n",
+ "train = pd.concat([X_train,y_train], axis=1)\n",
+ "\n",
+ "no_fraud = train[train['isFraud'] == 0]\n",
+ "yes_fraud = train[train['isFraud'] == 1]\n",
+ "\n",
+ "yes_fraud_oversampled = resample(yes_fraud, \n",
+ "replace=True, \n",
+ "n_samples=len(no_fraud), \n",
+ "random_state=0)\n",
+ "\n",
+ "train_oversampled = pd.concat([no_fraud,yes_fraud_oversampled])\n",
+ "\n",
+ "X_train_over = train_oversampled.drop(columns=['isFraud'])\n",
+ "y_train_over = train_oversampled['isFraud']\n",
+ "\n",
+ "model.fit(X_train_over,y_train_over)\n",
+ "pred = model.predict(X_test)\n",
+ "\n",
+ "print(classification_report(y_test,pred))\n",
+ "confusion_matrix(y_test,pred)"
]
},
{
@@ -108,11 +682,16 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
- "# Your response here"
+ "# Your response here\n",
+ "\n",
+    "## The problem here with the logistic regression is that it does not catch the frauds reliably: recall on the fraud class is only about 0.51, so our model is poor\n",
+    "## even though overall accuracy looks very high because of the imbalanced data.\n",
+ "\n",
+    "## meanwhile, with the second method we lose a little bit of precision but we manage to capture every single fraud and therefore save a lot of money."
]
},
{
@@ -125,7 +704,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3.9.13 ('CLasses')",
"language": "python",
"name": "python3"
},
@@ -139,7 +718,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.9.13"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "fd8103c585cc5071142faf5ebc004de0b2da2b7faa17a9198853f983d8f17421"
+ }
}
},
"nbformat": 4,