diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb index a3a5359..665ec89 100644 --- a/your-code/lab_imbalance.ipynb +++ b/your-code/lab_imbalance.ipynb @@ -32,7 +32,372 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "# Your code here\n", + "import zipfile \n", + "import pandas as pd \n", + "\n", + "with zipfile.ZipFile(\"../archive (3).zip\") as z:\n", + " with z.open(\"PS_20174392719_1491204439457_log.csv\") as f:\n", + " kaggle_df = pd.read_csv(f)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "kaggle_df = kaggle_df.sample(100000)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "step int64\n", + "type object\n", + "amount float64\n", + "nameOrig object\n", + "oldbalanceOrg float64\n", + "newbalanceOrig float64\n", + "nameDest object\n", + "oldbalanceDest float64\n", + "newbalanceDest float64\n", + "isFraud int64\n", + "isFlaggedFraud int64\n", + "dtype: object" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kaggle_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(100000, 11)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kaggle_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stepamountoldbalanceOrgnewbalanceOrigoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
count100000.0000001.000000e+051.000000e+051.000000e+051.000000e+051.000000e+05100000.000000100000.0
mean243.7234901.845359e+058.387364e+058.601691e+051.115018e+061.243651e+060.0011700.0
std142.3677747.065582e+052.888697e+062.924544e+063.388285e+063.773148e+060.0341850.0
min1.0000000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.0000000.0
25%156.0000001.337234e+040.000000e+000.000000e+000.000000e+000.000000e+000.0000000.0
50%239.0000007.532524e+041.447550e+040.000000e+001.345713e+052.169753e+050.0000000.0
75%336.0000002.087282e+051.077144e+051.486264e+059.526921e+051.116704e+060.0000000.0
max739.0000005.367051e+073.465715e+073.461632e+071.919167e+082.362512e+081.0000000.0
\n", + "
" + ], + "text/plain": [ + " step amount oldbalanceOrg newbalanceOrig \\\n", + "count 100000.000000 1.000000e+05 1.000000e+05 1.000000e+05 \n", + "mean 243.723490 1.845359e+05 8.387364e+05 8.601691e+05 \n", + "std 142.367774 7.065582e+05 2.888697e+06 2.924544e+06 \n", + "min 1.000000 0.000000e+00 0.000000e+00 0.000000e+00 \n", + "25% 156.000000 1.337234e+04 0.000000e+00 0.000000e+00 \n", + "50% 239.000000 7.532524e+04 1.447550e+04 0.000000e+00 \n", + "75% 336.000000 2.087282e+05 1.077144e+05 1.486264e+05 \n", + "max 739.000000 5.367051e+07 3.465715e+07 3.461632e+07 \n", + "\n", + " oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "count 1.000000e+05 1.000000e+05 100000.000000 100000.0 \n", + "mean 1.115018e+06 1.243651e+06 0.001170 0.0 \n", + "std 3.388285e+06 3.773148e+06 0.034185 0.0 \n", + "min 0.000000e+00 0.000000e+00 0.000000 0.0 \n", + "25% 0.000000e+00 0.000000e+00 0.000000 0.0 \n", + "50% 1.345713e+05 2.169753e+05 0.000000 0.0 \n", + "75% 9.526921e+05 1.116704e+06 0.000000 0.0 \n", + "max 1.919167e+08 2.362512e+08 1.000000 0.0 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kaggle_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
2110643183CASH_IN276041.01C148948261517099.00793140.01C56286310140681.950.0000
1495837142CASH_IN302753.44C3433909566220797.906523551.34C51513299815350502.1115047748.6700
1108056130CASH_OUT402452.54C77948695247366.780.00C1384245820826029.961228482.5000
3394546255CASH_OUT3376.90C8092011050.000.00C182458404188984.1792361.0600
16561912CASH_IN87338.92C1863067116099751.9516187090.87C13967556415018206.335505886.8400
\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg newbalanceOrig \\\n", + "2110643 183 CASH_IN 276041.01 C148948261 517099.00 793140.01 \n", + "1495837 142 CASH_IN 302753.44 C343390956 6220797.90 6523551.34 \n", + "1108056 130 CASH_OUT 402452.54 C77948695 247366.78 0.00 \n", + "3394546 255 CASH_OUT 3376.90 C809201105 0.00 0.00 \n", + "165619 12 CASH_IN 87338.92 C18630671 16099751.95 16187090.87 \n", + "\n", + " nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "2110643 C562863101 40681.95 0.00 0 0 \n", + "1495837 C515132998 15350502.11 15047748.67 0 0 \n", + "1108056 C1384245820 826029.96 1228482.50 0 0 \n", + "3394546 C1824584041 88984.17 92361.06 0 0 \n", + "165619 C1396755641 5018206.33 5505886.84 0 0 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kaggle_df.head()" ] }, { @@ -44,11 +409,57 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud\n", + "1 CASH_IN 80448.13 C161982472 4274305.14 4354753.26 C20671747 124139.21 43691.09 0 0 0.00001\n", + "306 PAYMENT 1054.53 C73563022 38874.00 37819.47 M2109867061 0.00 0.00 0 0 0.00001\n", + " 2233.57 C644344366 0.00 0.00 M432445649 0.00 0.00 0 0 0.00001\n", + " 2195.00 C1797312755 0.00 0.00 M852421074 0.00 0.00 0 0 0.00001\n", + " 2186.72 C82894008 311.00 0.00 M698318441 0.00 0.00 0 0 0.00001\n", + " ... \n", + "183 PAYMENT 9667.83 C274845817 0.00 0.00 M876860114 0.00 0.00 0 0 0.00001\n", + " 9494.15 C1114810171 0.00 0.00 M1161440260 0.00 0.00 0 0 0.00001\n", + " 9272.14 C1319754951 345925.55 336653.41 M886679806 0.00 0.00 0 0 0.00001\n", + " 9181.17 C34770566 0.00 0.00 M1800031522 0.00 0.00 0 0 0.00001\n", + "739 CASH_OUT 8116.53 C564539602 8116.53 0.00 C1935865739 7638.26 15754.79 1 0 0.00001\n", + "Length: 100000, dtype: float64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your response here" + "# Your response here\n", + "kaggle_df.value_counts(normalize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0.99883\n", + "1 0.00117\n", + "Name: isFraud, dtype: float64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kaggle_df.isFraud.value_counts(normalize=True)" ] }, { @@ -60,11 +471,25 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "# Your code here\n" + "# Your code here\n", + "#Yess as long as the step unit represents the same fraction of time \n", + "from sklearn.preprocessing import LabelEncoder\n", + "le = LabelEncoder()\n", + "label_columns = [\"type\"]\n", + "kaggle_df[label_columns] = kaggle_df[label_columns].apply(le.fit_transform)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "kaggle_df.drop(labels=['nameDest', 'nameOrig'], axis=1, inplace=True) # GOT AN ERROR ON THE REGRESSION BECAUSE I FORGOT TO REMOVE THESE COLUMNS xD" ] }, { @@ -76,11 +501,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "# Your code here\n", + "Y= kaggle_df[\"isFraud\"]\n", + "X= kaggle_df.drop([\"isFraud\"], axis=1)\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Logistic Regression Test : 99.795\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "lr = LogisticRegression()#class_weight='balanced')\n", + "lr.fit(X_train, Y_train)\n", + "acc = lr.score(X_test, Y_test)*100\n", + "\n", + "print(\"Logistic Regression Test : \",acc)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "99.795\n", + "[[19948 30]\n", + " [ 11 11]]\n" + ] + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix, accuracy_score\n", + "y_pred = lr.predict(X_test)\n", + "print(accuracy_score(Y_test, y_pred)*100)\n", + "cm = confusion_matrix(Y_test, y_pred)\n", + "print(cm)" ] }, { @@ -92,11 +568,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Decision Tree Test Accuracy 99.92\n", + "[[19968 10]\n", + " [ 6 16]]\n" + ] + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "dtc = DecisionTreeClassifier()\n", + "dtc.fit(X_train, Y_train)\n", + "y_pred = dtc.predict(X_test)\n", + "acc = dtc.score(X_test, Y_test)*100\n", + "print(\"Decision Tree Test Accuracy \",acc,)\n", + "cm = confusion_matrix(Y_test, y_pred)\n", + "print(cm)\n" ] }, { @@ -108,11 +602,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "# Your response here" + "# Your response here\n", + "#the decison tree worked better with an accuravy of 99.92 " ] }, { @@ -125,7 +620,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.12 ('base')", "language": "python", "name": "python3" }, @@ -139,7 +634,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.12" + }, + "vscode": { + "interpreter": { + "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186" + } } }, "nbformat": 4,