diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb
index a3a5359..665ec89 100644
--- a/your-code/lab_imbalance.ipynb
+++ b/your-code/lab_imbalance.ipynb
@@ -32,7 +32,372 @@
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "import zipfile \n",
+ "import pandas as pd \n",
+ "\n",
+ "with zipfile.ZipFile(\"../archive (3).zip\") as z:\n",
+ " with z.open(\"PS_20174392719_1491204439457_log.csv\") as f:\n",
+ " kaggle_df = pd.read_csv(f)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "kaggle_df = kaggle_df.sample(100000)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "step int64\n",
+ "type object\n",
+ "amount float64\n",
+ "nameOrig object\n",
+ "oldbalanceOrg float64\n",
+ "newbalanceOrig float64\n",
+ "nameDest object\n",
+ "oldbalanceDest float64\n",
+ "newbalanceDest float64\n",
+ "isFraud int64\n",
+ "isFlaggedFraud int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kaggle_df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(100000, 11)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kaggle_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " amount | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 100000.000000 | \n",
+ " 1.000000e+05 | \n",
+ " 1.000000e+05 | \n",
+ " 1.000000e+05 | \n",
+ " 1.000000e+05 | \n",
+ " 1.000000e+05 | \n",
+ " 100000.000000 | \n",
+ " 100000.0 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 243.723490 | \n",
+ " 1.845359e+05 | \n",
+ " 8.387364e+05 | \n",
+ " 8.601691e+05 | \n",
+ " 1.115018e+06 | \n",
+ " 1.243651e+06 | \n",
+ " 0.001170 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 142.367774 | \n",
+ " 7.065582e+05 | \n",
+ " 2.888697e+06 | \n",
+ " 2.924544e+06 | \n",
+ " 3.388285e+06 | \n",
+ " 3.773148e+06 | \n",
+ " 0.034185 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 1.000000 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 156.000000 | \n",
+ " 1.337234e+04 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 239.000000 | \n",
+ " 7.532524e+04 | \n",
+ " 1.447550e+04 | \n",
+ " 0.000000e+00 | \n",
+ " 1.345713e+05 | \n",
+ " 2.169753e+05 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 336.000000 | \n",
+ " 2.087282e+05 | \n",
+ " 1.077144e+05 | \n",
+ " 1.486264e+05 | \n",
+ " 9.526921e+05 | \n",
+ " 1.116704e+06 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 739.000000 | \n",
+ " 5.367051e+07 | \n",
+ " 3.465715e+07 | \n",
+ " 3.461632e+07 | \n",
+ " 1.919167e+08 | \n",
+ " 2.362512e+08 | \n",
+ " 1.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step amount oldbalanceOrg newbalanceOrig \\\n",
+ "count 100000.000000 1.000000e+05 1.000000e+05 1.000000e+05 \n",
+ "mean 243.723490 1.845359e+05 8.387364e+05 8.601691e+05 \n",
+ "std 142.367774 7.065582e+05 2.888697e+06 2.924544e+06 \n",
+ "min 1.000000 0.000000e+00 0.000000e+00 0.000000e+00 \n",
+ "25% 156.000000 1.337234e+04 0.000000e+00 0.000000e+00 \n",
+ "50% 239.000000 7.532524e+04 1.447550e+04 0.000000e+00 \n",
+ "75% 336.000000 2.087282e+05 1.077144e+05 1.486264e+05 \n",
+ "max 739.000000 5.367051e+07 3.465715e+07 3.461632e+07 \n",
+ "\n",
+ " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n",
+ "count 1.000000e+05 1.000000e+05 100000.000000 100000.0 \n",
+ "mean 1.115018e+06 1.243651e+06 0.001170 0.0 \n",
+ "std 3.388285e+06 3.773148e+06 0.034185 0.0 \n",
+ "min 0.000000e+00 0.000000e+00 0.000000 0.0 \n",
+ "25% 0.000000e+00 0.000000e+00 0.000000 0.0 \n",
+ "50% 1.345713e+05 2.169753e+05 0.000000 0.0 \n",
+ "75% 9.526921e+05 1.116704e+06 0.000000 0.0 \n",
+ "max 1.919167e+08 2.362512e+08 1.000000 0.0 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kaggle_df.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " type | \n",
+ " amount | \n",
+ " nameOrig | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " nameDest | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2110643 | \n",
+ " 183 | \n",
+ " CASH_IN | \n",
+ " 276041.01 | \n",
+ " C148948261 | \n",
+ " 517099.00 | \n",
+ " 793140.01 | \n",
+ " C562863101 | \n",
+ " 40681.95 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1495837 | \n",
+ " 142 | \n",
+ " CASH_IN | \n",
+ " 302753.44 | \n",
+ " C343390956 | \n",
+ " 6220797.90 | \n",
+ " 6523551.34 | \n",
+ " C515132998 | \n",
+ " 15350502.11 | \n",
+ " 15047748.67 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1108056 | \n",
+ " 130 | \n",
+ " CASH_OUT | \n",
+ " 402452.54 | \n",
+ " C77948695 | \n",
+ " 247366.78 | \n",
+ " 0.00 | \n",
+ " C1384245820 | \n",
+ " 826029.96 | \n",
+ " 1228482.50 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3394546 | \n",
+ " 255 | \n",
+ " CASH_OUT | \n",
+ " 3376.90 | \n",
+ " C809201105 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " C1824584041 | \n",
+ " 88984.17 | \n",
+ " 92361.06 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 165619 | \n",
+ " 12 | \n",
+ " CASH_IN | \n",
+ " 87338.92 | \n",
+ " C18630671 | \n",
+ " 16099751.95 | \n",
+ " 16187090.87 | \n",
+ " C1396755641 | \n",
+ " 5018206.33 | \n",
+ " 5505886.84 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step type amount nameOrig oldbalanceOrg newbalanceOrig \\\n",
+ "2110643 183 CASH_IN 276041.01 C148948261 517099.00 793140.01 \n",
+ "1495837 142 CASH_IN 302753.44 C343390956 6220797.90 6523551.34 \n",
+ "1108056 130 CASH_OUT 402452.54 C77948695 247366.78 0.00 \n",
+ "3394546 255 CASH_OUT 3376.90 C809201105 0.00 0.00 \n",
+ "165619 12 CASH_IN 87338.92 C18630671 16099751.95 16187090.87 \n",
+ "\n",
+ " nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n",
+ "2110643 C562863101 40681.95 0.00 0 0 \n",
+ "1495837 C515132998 15350502.11 15047748.67 0 0 \n",
+ "1108056 C1384245820 826029.96 1228482.50 0 0 \n",
+ "3394546 C1824584041 88984.17 92361.06 0 0 \n",
+ "165619 C1396755641 5018206.33 5505886.84 0 0 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kaggle_df.head()"
]
},
{
@@ -44,11 +409,57 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud\n",
+ "1 CASH_IN 80448.13 C161982472 4274305.14 4354753.26 C20671747 124139.21 43691.09 0 0 0.00001\n",
+ "306 PAYMENT 1054.53 C73563022 38874.00 37819.47 M2109867061 0.00 0.00 0 0 0.00001\n",
+ " 2233.57 C644344366 0.00 0.00 M432445649 0.00 0.00 0 0 0.00001\n",
+ " 2195.00 C1797312755 0.00 0.00 M852421074 0.00 0.00 0 0 0.00001\n",
+ " 2186.72 C82894008 311.00 0.00 M698318441 0.00 0.00 0 0 0.00001\n",
+ " ... \n",
+ "183 PAYMENT 9667.83 C274845817 0.00 0.00 M876860114 0.00 0.00 0 0 0.00001\n",
+ " 9494.15 C1114810171 0.00 0.00 M1161440260 0.00 0.00 0 0 0.00001\n",
+ " 9272.14 C1319754951 345925.55 336653.41 M886679806 0.00 0.00 0 0 0.00001\n",
+ " 9181.17 C34770566 0.00 0.00 M1800031522 0.00 0.00 0 0 0.00001\n",
+ "739 CASH_OUT 8116.53 C564539602 8116.53 0.00 C1935865739 7638.26 15754.79 1 0 0.00001\n",
+ "Length: 100000, dtype: float64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your response here"
+ "# Your response here\n",
+ "kaggle_df.value_counts(normalize=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 0.99883\n",
+ "1 0.00117\n",
+ "Name: isFraud, dtype: float64"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kaggle_df.isFraud.value_counts(normalize=True)"
]
},
{
@@ -60,11 +471,25 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here\n"
+ "# Your code here\n",
+ "# Yes, as long as each step unit represents the same span of time.\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "le = LabelEncoder()\n",
+ "label_columns = [\"type\"]\n",
+ "kaggle_df[label_columns] = kaggle_df[label_columns].apply(le.fit_transform)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "kaggle_df.drop(labels=['nameDest', 'nameOrig'], axis=1, inplace=True) # Drop the account-name columns (object dtype); the regression raised an error when they were left in."
]
},
{
@@ -76,11 +501,62 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "Y= kaggle_df[\"isFraud\"]\n",
+ "X= kaggle_df.drop([\"isFraud\"], axis=1)\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Logistic Regression Test : 99.795\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "lr = LogisticRegression()#class_weight='balanced')\n",
+ "lr.fit(X_train, Y_train)\n",
+ "acc = lr.score(X_test, Y_test)*100\n",
+ "\n",
+ "print(\"Logistic Regression Test : \",acc)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "99.795\n",
+ "[[19948 30]\n",
+ " [ 11 11]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import confusion_matrix, accuracy_score\n",
+ "y_pred = lr.predict(X_test)\n",
+ "print(accuracy_score(Y_test, y_pred)*100)\n",
+ "cm = confusion_matrix(Y_test, y_pred)\n",
+ "print(cm)"
]
},
{
@@ -92,11 +568,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Decision Tree Test Accuracy 99.92\n",
+ "[[19968 10]\n",
+ " [ 6 16]]\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "dtc = DecisionTreeClassifier()\n",
+ "dtc.fit(X_train, Y_train)\n",
+ "y_pred = dtc.predict(X_test)\n",
+ "acc = dtc.score(X_test, Y_test)*100\n",
+ "print(\"Decision Tree Test Accuracy \",acc,)\n",
+ "cm = confusion_matrix(Y_test, y_pred)\n",
+ "print(cm)\n"
]
},
{
@@ -108,11 +602,12 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
- "# Your response here"
+ "# Your response here\n",
+ "# The decision tree worked better, with a test accuracy of 99.92% versus 99.795% for the logistic regression."
]
},
{
@@ -125,7 +620,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3.9.12 ('base')",
"language": "python",
"name": "python3"
},
@@ -139,7 +634,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.9.12"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186"
+ }
}
},
"nbformat": 4,