diff --git a/your-code/.gitignore b/your-code/.gitignore new file mode 100644 index 0000000..c5ef2b7 --- /dev/null +++ b/your-code/.gitignore @@ -0,0 +1 @@ +archive.zip diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb index a3a5359..0351f50 100644 --- a/your-code/lab_imbalance.ipynb +++ b/your-code/lab_imbalance.ipynb @@ -28,11 +28,132 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(100000, 11)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
01PAYMENT9839.64C1231006815170136.0160296.36M19797871550.00.000
11PAYMENT1864.28C166654429521249.019384.72M20442822250.00.000
\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg newbalanceOrig \\\n", + "0 1 PAYMENT 9839.64 C1231006815 170136.0 160296.36 \n", + "1 1 PAYMENT 1864.28 C1666544295 21249.0 19384.72 \n", + "\n", + " nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "0 M1979787155 0.0 0.0 0 0 \n", + "1 M2044282225 0.0 0.0 0 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# Your code here" + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from imblearn.over_sampling import SMOTE\n", + "\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "data = pd.read_csv(r'C:\\Users\\Francesco M\\Documents\\IronHack Syllabus\\Labs\\Module 3\\lab-imbalance\\your-code\\archive.zip', nrows=100000)\n", + "print(data.shape)\n", + "display(data.head(2))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 99884\n", + "1 116\n", + "Name: isFraud, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['isFraud'].value_counts()" ] }, { @@ -44,11 +165,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD1CAYAAAClSgmzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAO20lEQVR4nO3dcaidd33H8fdnyaJVqU3tJdQkXQJmk1gY1tBmCGOY0aZ1LP1DpWWsoQTzh+2mYzDj/gmoBYWxzoIWgs1MRYylExo0NYRoGWO0za2V1rTrcqmrSWjt1cR2m2iNfvfH/WUebu8vae5Jz7km7xcczvN8f7/f83wvhPvp85zn3KaqkCRpLr8z7gYkSQuXISFJ6jIkJEldhoQkqcuQkCR1GRKSpK7F427gXLvssstq1apV425Dkn6rPPbYYz+uqonZ9fMuJFatWsXk5OS425Ck3ypJnpur7u0mSVKXISFJ6jIkJEldhoQkqeuMIZFkZ5IXk3x/oHZpkv1JDrf3pa2eJHclmUryRJKrBtZsbvMPJ9k8UH9PkifbmruS5HTnkCSNzmu5kvgSsHFWbRtwoKrWAAfaPsD1wJr22grcDTO/8IHtwDXA1cD2gV/6dwMfHli38QznkCSNyBlDoqr+FTg+q7wJ2NW2dwE3DtTvrRkPA5ckuRy4DthfVcer6gSwH9jYxi6uqodr5m+W3zvrWHOdQ5I0IvP9TGJZVT3ftl8AlrXt5cCRgXlHW+109aNz1E93DknSiAz9ZbqqqiSv6/+56EznSLKVmdtbXHHFFa9nK+fMqm3fHHcL543/+sz7x92CdN6a75XEj9qtItr7i61+DFg5MG9Fq52uvmKO+unO8SpVtaOq1lXVuomJV32rXJI0T/MNiT3AqSeUNgMPDNRvaU85rQdeareM9gHXJlnaPrC+FtjXxl5Osr491XTLrGPNdQ5J0oic8XZTkq8CfwJcluQoM08pfQa4L8kW4DngQ236XuAGYAr4GXArQFUdT/Ip4GCb98mqOvVh+EeYeYLqIuDB9uI055AkjcgZQ6Kqbu4MbZhjbgG3dY6zE9g5R30SuHKO+k/mOockaXT8xrUkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKlrqJBI8jdJDiX5fpKvJnljktVJHkkyleRrSZa0uW9o+1NtfNXAcT7R6s8kuW6gvrHVppJsG6ZXSdLZm3dIJFkO/DWwrqquBBYBNwGfBe6sqncAJ4AtbckW4ESr39nmkWRtW/cuYCPwhSSLkiwCPg9cD6wFbm5zJUkjMuztpsXARUkWA28CngfeB9zfxncBN7btTW2fNr4hSVp9d1X9oqp+AEwBV7fXVFU9W1WvALvbXEnSiMw7JKrqGPAPwA+ZCYeXgMeAn1bVyTbtKLC8bS8HjrS1J9v8tw3WZ63p1SVJIzLM7aalzPyX/Wrg7cCbmbldNHJJtiaZTDI5PT09jhYk6bw0zO2mPwV+UFXTVfVL4OvAe4FL2u0ngBXAsbZ9DFgJ0MbfCvxksD5rTa/+KlW1o6rWVdW6iYmJIX4kSdKgYULih8D6JG9qny1sAJ4CvgN8oM3ZDDzQtve0fdr4t6uqWv2m9vTTamAN8ChwEFjTnpZawsyH23uG6FeSdJYWn3nK3KrqkST3A98FTgKPAzuAbwK7k3y61e5pS+4BvpxkCjjOzC99qupQkvuYCZiTwG1V9SuAJLcD+5h5cmpnVR2ab7+SpLM375AAqKrtwPZZ5WeZeTJp9tyfAx/sHOcO4I456nuBvcP0KEmaP79xLUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVLXUCGR5JIk9yf5jyRPJ/mjJJcm2Z/kcHtf2uYmyV1JppI8keSqgeNsbvMPJ9k8UH9PkifbmruSZJh+JUlnZ9gric8B36qqdwJ/CDwNbAMOVNUa4EDbB7geWNNeW4G7AZJcCmwHrgGuBrafCpY258MD6zYO2a8k6SzMOySSvBX4Y+AegKp6pap+CmwCdrVpu4Ab2/Ym4N6a8TBwSZLLgeuA/VV1vKpOAPuBjW3s4qp6uKoKuHfgWJKkERjmSmI1MA38c5LHk3wxyZuBZVX1fJvzArCsbS8HjgysP9pqp6sfnaMuSRqRYUJiMXAVcHdVvRv4X35zawmAdgVQQ5zjNUmyNclkksnp6enX+3SSdMEYJiSOAker6pG2fz8zofGjdquI9v5iGz8GrBxYv6LVTldfMUf9VapqR1Wtq6p1ExMTQ/xIkqRB8w6JqnoBOJLkD1ppA/AUsAc49YTSZuCBtr0HuKU95bQeeKndltoHXJtkafvA+lpgXxt7Ocn69lTTLQPHkiSNwOIh1/8V8JUkS4BngVuZCZ77kmwBngM+1ObuBW4ApoCftblU1fEknwIOtnmfrKrjbfsjwJeAi4AH20uSNCJDhURVfQ9YN8fQhjnmFnBb5zg7gZ1z1CeBK4fpUZI0f37jWpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1DV0SCRZlOTxJN9o+6uTPJJkKsnXkixp9Te0/ak2vmrgGJ9o9WeSXDdQ39hqU0m2DdurJOnsnIsriY8CTw/sfxa4s6reAZwAtrT6FuBEq9/Z5pFkLXAT8C5gI/CFFjyLgM8D1wNrgZvbXEnSiAwVEklWAO8Hvtj2A7wPuL9N2QXc2LY3tX3a+IY2fxOwu6p+UVU/AKaAq9trqqqerapXgN1triRpRIa9kvgn4O+AX7f9twE/raqTbf8osLxtLweOALTxl9r8/6/PWtOrv0qSrUkmk0xOT08P+SNJkk6Zd0gk+TPgxap67Bz2My9VtaOq1lXVuomJiXG3I0nnjcVDrH0v8OdJbgDeCFwMfA64JMnidrWwAjjW5h8DVgJHkywG3gr8ZKB+yuCaXl2SNALzvpKoqk9U1YqqWsXMB8/frqq/AL4DfKBN2ww80Lb3tH3a+Lerqlr9pvb002pgDfAocBBY056WWtLOsWe+/UqSzt4wVxI9Hwd2J/k08DhwT6vfA3w5yRRwnJlf+lTVoST3AU8BJ4HbqupXAEluB/YBi4CdVXXodehXktRxTkKiqh4CHmrbzzLzZNLsOT8HPthZfwdwxxz1vcDec9GjJOns+Y1rSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkrrmHRJJVib5TpKnkhxK8tFWvzTJ/iSH2/vSVk+Su5JMJXkiyVUDx9rc5h9Osnmg/p4kT7Y1dyXJMD+sJOnsDHMlcRL426paC6wHbkuyFtgGHKiqNcCBtg9wPbCmvbYCd8NMqADbgWuAq4Htp4KlzfnwwLqNQ/QrSTpL8w6Jqnq+qr7btv8beBpYDmwCdrVpu4Ab2/Ym4N6a8TBwSZLLgeuA/VV1vKpOAPuBjW3s4qp6uKoKuHfgWJKkETgnn0kkWQW8G3gEWFZVz7ehF4BlbXs5cGRg2dFWO1396Bx1SdKIDB0SSd4C/Avwsap6eXCsXQHUsOd4DT1sTTKZZHJ6evr1Pp0kXTCGCokkv8tMQHylqr7eyj9qt4po7y+2+jFg5cDyFa12uvqKOeqvUlU7qmpdVa2bmJgY5keSJA0Y5ummAPcAT1fVPw4M7QFOPaG0GXhgoH5Le8ppPfBSuy21D7g2ydL2gfW1wL429nKS9e1ctwwcS5I0AouHWPte4C+BJ5N8r9X+HvgMcF+SLcBzwIfa2F7gBmAK+BlwK0BVHU/yKeBgm/fJqjretj8CfAm4CHiwvSRJIzLvkKiqfwN631vYMMf8Am7rHGsnsHOO+iRw5Xx7lCQNx29cS5K6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSuhZ8SCTZmOSZJFNJto27H0m6kCzokEiyCPg8cD2wFrg5ydrxdiVJF44FHRLA1cBUVT1bVa8Au4FNY+5Jki4Yi8fdwBksB44M7B8Frpk9KclWYGvb/Z8kz4ygtwvFZcCPx93E6eSz4+5AY7Lg/23+lvm9uYoLPSRek6raAewYdx/noySTVbVu3H1Is/lvczQW+u2mY8DKgf0VrSZJGoGFHhIHgTVJVidZAtwE7BlzT5J0wVjQt5uq6mSS24F9wCJgZ1UdGnNbFxpv42mh8t/mCKSqxt2DJGmBWui3myRJY2RISJK6DAlJUteC/uBao5Xkncx8o315Kx0D9lTV0+PrStI4eSUhAJJ8nJk/exLg0fYK8FX/sKIWsiS3jruH85lPNwmAJP8JvKuqfjmrvgQ4VFVrxtOZdHpJflhVV4y7j/OVt5t0yq+BtwPPzapf3saksUnyRG8IWDbKXi40hoRO+RhwIMlhfvNHFa8A3gHcPq6mpGYZcB1wYlY9wL+Pvp0LhyEhAKrqW0l+n5k/zz74wfXBqvrV+DqTAPgG8Jaq+t7sgSQPjbybC4ifSUiSuny6SZLUZUhIkroMCUlSlyEhSeoyJCRJXf8Hd8Pz8m1yqC0AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outcome:\n", + " 0 99884\n", + "1 116\n", + "Name: isFraud, dtype: int64\n" + ] + } + ], "source": [ - "# Your response here" + "# Your response here\n", + "# Your code here\n", + "count_class = data['isFraud'].value_counts()\n", + "count_class.plot(kind='bar')\n", + "plt.show()\n", + "\n", + "print('Outcome:\\n',count_class)" ] }, { @@ -60,11 +210,189 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "step int64\n", + "type object\n", + "amount float64\n", + "nameOrig object\n", + "oldbalanceOrg float64\n", + "newbalanceOrig float64\n", + "nameDest object\n", + "oldbalanceDest float64\n", + "newbalanceDest float64\n", + "isFraud int64\n", + "isFlaggedFraud int64\n", + "dtype: object\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
01PAYMENT9839.64C1231006815170136.0160296.36M19797871550.00.000
11PAYMENT1864.28C166654429521249.019384.72M20442822250.00.000
21TRANSFER181.00C1305486145181.00.00C5532640650.00.010
31CASH_OUT181.00C840083671181.00.00C3899701021182.00.010
41PAYMENT11668.14C204853772041554.029885.86M12307017030.00.000
\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg newbalanceOrig \\\n", + "0 1 PAYMENT 9839.64 C1231006815 170136.0 160296.36 \n", + "1 1 PAYMENT 1864.28 C1666544295 21249.0 19384.72 \n", + "2 1 TRANSFER 181.00 C1305486145 181.0 0.00 \n", + "3 1 CASH_OUT 181.00 C840083671 181.0 0.00 \n", + "4 1 PAYMENT 11668.14 C2048537720 41554.0 29885.86 \n", + "\n", + " nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "0 M1979787155 0.0 0.0 0 0 \n", + "1 M2044282225 0.0 0.0 0 0 \n", + "2 C553264065 0.0 0.0 1 0 \n", + "3 C38997010 21182.0 0.0 1 0 \n", + "4 M1230701703 0.0 0.0 0 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(data.dtypes)\n", + "display(data.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(149820, 10)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here\n" + "# Your code here\n", + "\n", + "data.drop(['nameOrig', 'nameDest', 'isFlaggedFraud', 'step'], axis=1, inplace=True)\n", + "data = pd.get_dummies(data)\n", + "data.head(2)\n", + "X = data.drop('isFraud', axis=1)\n", + "y = data['isFraud']\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n", + "sm = SMOTE(random_state=1, sampling_strategy=1.0)\n", + "X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)\n", + "X_train_SMOTE.shape" ] }, { @@ -76,11 +404,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9146\n", + "0.9366506474435989\n", + "False\n" + ] + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "\n", + "lrc = LogisticRegression(max_iter=1000)\n", + "lrc.fit(X_train_SMOTE, y_train_SMOTE)\n", + "\n", + "\n", + "print(lrc.score(X_test, y_test))\n", + "print(lrc.score(X_train_SMOTE, y_train_SMOTE))\n", + "print(lrc.score(X_train_SMOTE, y_train_SMOTE)