diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb
index a3a5359..21af561 100644
--- a/your-code/lab_imbalance.ipynb
+++ b/your-code/lab_imbalance.ipynb
@@ -28,18 +28,199 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "import pandas as pd"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 4,
"metadata": {},
+ "outputs": [],
"source": [
- "### What is the distribution of the outcome? "
+ "# Location of the transactions CSV, relative to this notebook (adjust if the file lives elsewhere).\n",
+ "# A relative path keeps the notebook runnable on machines other than the author's.\n",
+ "DATA_PATH = \"PS_20174392719_1491204439457_log.csv\"\n",
+ "entire_data = pd.read_csv(DATA_PATH)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Work on a reproducible random subset of 100,000 rows (fixed seed)\n",
+ "sample = entire_data.sample(n=100_000, random_state=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " type | \n",
+ " amount | \n",
+ " nameOrig | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " nameDest | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 6322570 | \n",
+ " 688 | \n",
+ " CASH_IN | \n",
+ " 23557.12 | \n",
+ " C867750533 | \n",
+ " 8059.00 | \n",
+ " 31616.12 | \n",
+ " C1026934669 | \n",
+ " 169508.66 | \n",
+ " 145951.53 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3621196 | \n",
+ " 274 | \n",
+ " PAYMENT | \n",
+ " 6236.13 | \n",
+ " C601099070 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " M701283411 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1226256 | \n",
+ " 133 | \n",
+ " PAYMENT | \n",
+ " 33981.87 | \n",
+ " C279540931 | \n",
+ " 18745.72 | \n",
+ " 0.00 | \n",
+ " M577905776 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2803274 | \n",
+ " 225 | \n",
+ " CASH_OUT | \n",
+ " 263006.42 | \n",
+ " C11675531 | \n",
+ " 20072.00 | \n",
+ " 0.00 | \n",
+ " C529577791 | \n",
+ " 390253.56 | \n",
+ " 653259.98 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3201247 | \n",
+ " 249 | \n",
+ " CASH_OUT | \n",
+ " 152013.74 | \n",
+ " C530649214 | \n",
+ " 20765.00 | \n",
+ " 0.00 | \n",
+ " C1304175579 | \n",
+ " 252719.19 | \n",
+ " 404732.93 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step type amount nameOrig oldbalanceOrg newbalanceOrig \\\n",
+ "6322570 688 CASH_IN 23557.12 C867750533 8059.00 31616.12 \n",
+ "3621196 274 PAYMENT 6236.13 C601099070 0.00 0.00 \n",
+ "1226256 133 PAYMENT 33981.87 C279540931 18745.72 0.00 \n",
+ "2803274 225 CASH_OUT 263006.42 C11675531 20072.00 0.00 \n",
+ "3201247 249 CASH_OUT 152013.74 C530649214 20765.00 0.00 \n",
+ "\n",
+ " nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n",
+ "6322570 C1026934669 169508.66 145951.53 0 0 \n",
+ "3621196 M701283411 0.00 0.00 0 0 \n",
+ "1226256 M577905776 0.00 0.00 0 0 \n",
+ "2803274 C529577791 390253.56 653259.98 0 0 \n",
+ "3201247 C1304175579 252719.19 404732.93 0 0 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "step int64\n",
+ "type object\n",
+ "amount float64\n",
+ "nameOrig object\n",
+ "oldbalanceOrg float64\n",
+ "newbalanceOrig float64\n",
+ "nameDest object\n",
+ "oldbalanceDest float64\n",
+ "newbalanceDest float64\n",
+ "isFraud int64\n",
+ "isFlaggedFraud int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample.dtypes"
]
},
{
@@ -48,7 +229,559 @@
"metadata": {},
"outputs": [],
"source": [
- "# Your response here"
+ "# There are 3 text columns (type, nameOrig, nameDest). Working on the first one: type"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CASH_OUT 35209\n",
+ "PAYMENT 33694\n",
+ "CASH_IN 21987\n",
+ "TRANSFER 8416\n",
+ "DEBIT 694\n",
+ "Name: type, dtype: int64"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample[\"type\"].value_counts()\n",
+ "#there are only 5 types, I could create dummies with this column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# One-hot encode the transaction type (the original 'type' column is replaced by type_* columns)\n",
+ "sample_dummy = pd.get_dummies(data=sample, columns=[\"type\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# pd.get_dummies(..., columns=['type']) already removes the original 'type' column,\n",
+ "# so tolerate its absence instead of raising KeyError on a fresh Restart & Run All.\n",
+ "sample_dummy = sample_dummy.drop(columns=\"type\", errors=\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " amount | \n",
+ " nameOrig | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " nameDest | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ " type_CASH_IN | \n",
+ " type_CASH_OUT | \n",
+ " type_DEBIT | \n",
+ " type_PAYMENT | \n",
+ " type_TRANSFER | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 6322570 | \n",
+ " 688 | \n",
+ " 23557.12 | \n",
+ " C867750533 | \n",
+ " 8059.00 | \n",
+ " 31616.12 | \n",
+ " C1026934669 | \n",
+ " 169508.66 | \n",
+ " 145951.53 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3621196 | \n",
+ " 274 | \n",
+ " 6236.13 | \n",
+ " C601099070 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " M701283411 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1226256 | \n",
+ " 133 | \n",
+ " 33981.87 | \n",
+ " C279540931 | \n",
+ " 18745.72 | \n",
+ " 0.00 | \n",
+ " M577905776 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2803274 | \n",
+ " 225 | \n",
+ " 263006.42 | \n",
+ " C11675531 | \n",
+ " 20072.00 | \n",
+ " 0.00 | \n",
+ " C529577791 | \n",
+ " 390253.56 | \n",
+ " 653259.98 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3201247 | \n",
+ " 249 | \n",
+ " 152013.74 | \n",
+ " C530649214 | \n",
+ " 20765.00 | \n",
+ " 0.00 | \n",
+ " C1304175579 | \n",
+ " 252719.19 | \n",
+ " 404732.93 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step amount nameOrig oldbalanceOrg newbalanceOrig \\\n",
+ "6322570 688 23557.12 C867750533 8059.00 31616.12 \n",
+ "3621196 274 6236.13 C601099070 0.00 0.00 \n",
+ "1226256 133 33981.87 C279540931 18745.72 0.00 \n",
+ "2803274 225 263006.42 C11675531 20072.00 0.00 \n",
+ "3201247 249 152013.74 C530649214 20765.00 0.00 \n",
+ "\n",
+ " nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud \\\n",
+ "6322570 C1026934669 169508.66 145951.53 0 0 \n",
+ "3621196 M701283411 0.00 0.00 0 0 \n",
+ "1226256 M577905776 0.00 0.00 0 0 \n",
+ "2803274 C529577791 390253.56 653259.98 0 0 \n",
+ "3201247 C1304175579 252719.19 404732.93 0 0 \n",
+ "\n",
+ " type_CASH_IN type_CASH_OUT type_DEBIT type_PAYMENT type_TRANSFER \n",
+ "6322570 1 0 0 0 0 \n",
+ "3621196 0 0 0 1 0 \n",
+ "1226256 0 0 0 1 0 \n",
+ "2803274 0 1 0 0 0 \n",
+ "3201247 0 1 0 0 0 "
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_dummy.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Now the columns nameOrig and nameDest: count how many distinct values each one holds.\n",
+ "# nunique must be *called*; without the parentheses the cell only shows the bound method.\n",
+ "sample_dummy[\"nameOrig\"].nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Number of distinct destination accounts (nunique must be called to get the count)\n",
+ "sample_dummy[\"nameDest\"].nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# nameOrig and nameDest have too many unique values to be turned into dummies,\n",
+ "# so both columns are discarded.\n",
+ "sample_dummy = sample_dummy.drop(columns=[\"nameDest\", \"nameOrig\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ " type_CASH_IN | \n",
+ " type_CASH_OUT | \n",
+ " type_DEBIT | \n",
+ " type_PAYMENT | \n",
+ " type_TRANSFER | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 6322570 | \n",
+ " 688 | \n",
+ " 23557.12 | \n",
+ " 8059.00 | \n",
+ " 31616.12 | \n",
+ " 169508.66 | \n",
+ " 145951.53 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3621196 | \n",
+ " 274 | \n",
+ " 6236.13 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1226256 | \n",
+ " 133 | \n",
+ " 33981.87 | \n",
+ " 18745.72 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2803274 | \n",
+ " 225 | \n",
+ " 263006.42 | \n",
+ " 20072.00 | \n",
+ " 0.00 | \n",
+ " 390253.56 | \n",
+ " 653259.98 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3201247 | \n",
+ " 249 | \n",
+ " 152013.74 | \n",
+ " 20765.00 | \n",
+ " 0.00 | \n",
+ " 252719.19 | \n",
+ " 404732.93 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step amount oldbalanceOrg newbalanceOrig oldbalanceDest \\\n",
+ "6322570 688 23557.12 8059.00 31616.12 169508.66 \n",
+ "3621196 274 6236.13 0.00 0.00 0.00 \n",
+ "1226256 133 33981.87 18745.72 0.00 0.00 \n",
+ "2803274 225 263006.42 20072.00 0.00 390253.56 \n",
+ "3201247 249 152013.74 20765.00 0.00 252719.19 \n",
+ "\n",
+ " newbalanceDest isFraud isFlaggedFraud type_CASH_IN type_CASH_OUT \\\n",
+ "6322570 145951.53 0 0 1 0 \n",
+ "3621196 0.00 0 0 0 0 \n",
+ "1226256 0.00 0 0 0 0 \n",
+ "2803274 653259.98 0 0 0 1 \n",
+ "3201247 404732.93 0 0 0 1 \n",
+ "\n",
+ " type_DEBIT type_PAYMENT type_TRANSFER \n",
+ "6322570 0 0 0 \n",
+ "3621196 0 1 0 \n",
+ "1226256 0 1 0 \n",
+ "2803274 0 0 0 \n",
+ "3201247 0 0 0 "
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_dummy.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 100000\n",
+ "Name: isFlaggedFraud, dtype: int64"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_dummy[\"isFlaggedFraud\"].value_counts()\n",
+ "# This column holds a single value (0) for every row of the sample, so it carries no information; I am going to remove it as well"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# isFlaggedFraud is constant in this sample, so it is removed from the feature set\n",
+ "sample_dummy = sample_dummy.drop(columns=[\"isFlaggedFraud\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "step 0\n",
+ "amount 0\n",
+ "oldbalanceOrg 0\n",
+ "newbalanceOrig 0\n",
+ "oldbalanceDest 0\n",
+ "newbalanceDest 0\n",
+ "isFraud 0\n",
+ "type_CASH_IN 0\n",
+ "type_CASH_OUT 0\n",
+ "type_DEBIT 0\n",
+ "type_PAYMENT 0\n",
+ "type_TRANSFER 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_dummy.isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Now, the database is ready to be used :)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### What is the distribution of the outcome? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 99876\n",
+ "1 124\n",
+ "Name: isFraud, dtype: int64"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_dummy[\"isFraud\"].value_counts()\n",
+ "# The outcome is very unbalanced: there are far more non-fraud cases (99,876) than fraud cases (124)"
]
},
{
@@ -64,7 +797,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Your code here\n"
+ "#Dataset already cleaned\n"
]
},
{
@@ -76,11 +809,65 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# The target is the fraud label; the features are every other column\n",
+ "y = sample_dummy[\"isFraud\"]\n",
+ "X = sample_dummy.drop(columns=\"isFraud\")\n",
+ "\n",
+ "# Default split proportions, fixed seed for reproducibility\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train accuracy score: 0.9981066666666667\n",
+ "Test accuracy score: 0.99836\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Baseline: logistic regression fitted on the training split as-is\n",
+ "LR = LogisticRegression(max_iter=1000).fit(X_train, y_train)\n",
+ "pred = LR.predict(X_test)\n",
+ "\n",
+ "train_accuracy = LR.score(X_train, y_train)\n",
+ "test_accuracy = LR.score(X_test, y_test)\n",
+ "print(\"Train accuracy score: \", train_accuracy)\n",
+ "print(\"Test accuracy score: \", test_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[24939, 32],\n",
+ " [ 9, 20]], dtype=int64)"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import confusion_matrix\n",
+ "\n",
+ "# Rows are the true classes, columns the predicted classes.\n",
+ "# 32 legitimate transactions are flagged as fraud (false positives);\n",
+ "# balancing the data better might reduce that number.\n",
+ "pred = LR.predict(X_test)\n",
+ "confusion_matrix(y_true=y_test, y_pred=pred)"
]
},
{
@@ -92,11 +879,307 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.utils import resample"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(99876, 12)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "(124, 12)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Same model as before, but trained on a rebalanced version of the TRAINING split only.\n",
+ "# Rows held out in X_test must never be resampled into the training data (that would leak the test set).\n",
+ "train_data = sample_dummy.loc[X_train.index]\n",
+ "no_fraud = train_data[train_data['isFraud'] == 0]\n",
+ "yes_fraud = train_data[train_data['isFraud'] == 1]\n",
+ "\n",
+ "display(no_fraud.shape)\n",
+ "display(yes_fraud.shape)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "# Oversample the minority (fraud) class: draw with replacement until it has as many rows as the majority class\n",
+ "yes_fraud_oversampled = resample(\n",
+ "    yes_fraud,\n",
+ "    replace=True,  # sampling with replacement is required because there are too few fraud rows\n",
+ "    n_samples=no_fraud.shape[0],  # match the number of non-fraud rows\n",
+ "    random_state=0,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(99876, 12)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "(99876, 12)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# After oversampling, both classes have the same number of rows\n",
+ "display(no_fraud.shape, yes_fraud_oversampled.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " type_CASH_IN | \n",
+ " type_CASH_OUT | \n",
+ " type_DEBIT | \n",
+ " type_PAYMENT | \n",
+ " type_TRANSFER | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 6322570 | \n",
+ " 688 | \n",
+ " 23557.12 | \n",
+ " 8059.00 | \n",
+ " 31616.12 | \n",
+ " 169508.66 | \n",
+ " 145951.53 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3621196 | \n",
+ " 274 | \n",
+ " 6236.13 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1226256 | \n",
+ " 133 | \n",
+ " 33981.87 | \n",
+ " 18745.72 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2803274 | \n",
+ " 225 | \n",
+ " 263006.42 | \n",
+ " 20072.00 | \n",
+ " 0.00 | \n",
+ " 390253.56 | \n",
+ " 653259.98 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3201247 | \n",
+ " 249 | \n",
+ " 152013.74 | \n",
+ " 20765.00 | \n",
+ " 0.00 | \n",
+ " 252719.19 | \n",
+ " 404732.93 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step amount oldbalanceOrg newbalanceOrig oldbalanceDest \\\n",
+ "6322570 688 23557.12 8059.00 31616.12 169508.66 \n",
+ "3621196 274 6236.13 0.00 0.00 0.00 \n",
+ "1226256 133 33981.87 18745.72 0.00 0.00 \n",
+ "2803274 225 263006.42 20072.00 0.00 390253.56 \n",
+ "3201247 249 152013.74 20765.00 0.00 252719.19 \n",
+ "\n",
+ " newbalanceDest isFraud type_CASH_IN type_CASH_OUT type_DEBIT \\\n",
+ "6322570 145951.53 0 1 0 0 \n",
+ "3621196 0.00 0 0 0 0 \n",
+ "1226256 0.00 0 0 0 0 \n",
+ "2803274 653259.98 0 0 1 0 \n",
+ "3201247 404732.93 0 0 1 0 \n",
+ "\n",
+ " type_PAYMENT type_TRANSFER \n",
+ "6322570 0 0 \n",
+ "3621196 1 0 \n",
+ "1226256 1 0 \n",
+ "2803274 0 0 \n",
+ "3201247 0 0 "
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Stack the majority rows and the oversampled minority rows into a single frame\n",
+ "train_oversampled = pd.concat([no_fraud, yes_fraud_oversampled], axis=0)\n",
+ "train_oversampled.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the rebalanced frame into features (X) and target (y)\n",
+ "X_train_over = train_oversampled.drop(columns=\"isFraud\").copy()\n",
+ "y_train_over = train_oversampled[\"isFraud\"].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Refit the same kind of model, this time on the oversampled data, and predict on the test split\n",
+ "LR = LogisticRegression(max_iter=1000).fit(X_train_over, y_train_over)\n",
+ "pred = LR.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train accuracy score: 0.9580666666666666\n",
+ "Test accuracy score: 0.9566\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Accuracy of the refitted model on the original (non-oversampled) train split and on the test split\n",
+ "train_accuracy_over = LR.score(X_train, y_train)\n",
+ "test_accuracy_over = LR.score(X_test, y_test)\n",
+ "print(\"Train accuracy score: \", train_accuracy_over)\n",
+ "print(\"Test accuracy score: \", test_accuracy_over)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[23887, 1084],\n",
+ " [ 1, 28]], dtype=int64)"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Confusion matrix of the model trained on the oversampled data (rows: true class, columns: predicted class)\n",
+ "pred = LR.predict(X_test)\n",
+ "confusion_matrix(y_true=y_test, y_pred=pred)"
]
},
{
@@ -112,7 +1195,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Your response here"
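+ "# Judged by accuracy and by the number of false positives, the first model (trained without resampling the data) worked better: both metrics were better before rebalancing.\n",
+ "# Note, however, that accuracy is a weak metric on such unbalanced data: the oversampled model missed fewer frauds (fewer false negatives), so which model is preferable depends on the cost of a missed fraud."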
]
},
{
@@ -125,7 +1208,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -139,7 +1222,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.9.13"
}
},
"nbformat": 4,