diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb index a3a5359..37f2128 100644 --- a/your-code/lab_imbalance.ipynb +++ b/your-code/lab_imbalance.ipynb @@ -28,11 +28,290 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegression\n" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "finance_dataset = pd.read_csv(r'C:\\Users\\Acer\\OneDrive\\Desktop\\Labs\\PS_20174392719_1491204439457_log.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "data = finance_dataset.head(100000)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
01PAYMENT9839.64C1231006815170136.0160296.36M19797871550.000.0000
11PAYMENT1864.28C166654429521249.019384.72M20442822250.000.0000
21TRANSFER181.00C1305486145181.00.00C5532640650.000.0010
31CASH_OUT181.00C840083671181.00.00C3899701021182.000.0010
41PAYMENT11668.14C204853772041554.029885.86M12307017030.000.0000
....................................
9999510PAYMENT4020.66C1410794718159929.0155908.34M12570365760.000.0000
9999610PAYMENT18345.49C7443036776206.00.00M17853445560.000.0000
9999710CASH_IN183774.91C10433185139173.0222947.91C3639288954925.050.0000
9999810CASH_OUT82237.17C7076629666031.00.00C1553004158592635.66799140.4600
9999910PAYMENT20096.56C1868032458110117.090020.44M14192018860.000.0000
\n", + "

100000 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg newbalanceOrig \\\n", + "0 1 PAYMENT 9839.64 C1231006815 170136.0 160296.36 \n", + "1 1 PAYMENT 1864.28 C1666544295 21249.0 19384.72 \n", + "2 1 TRANSFER 181.00 C1305486145 181.0 0.00 \n", + "3 1 CASH_OUT 181.00 C840083671 181.0 0.00 \n", + "4 1 PAYMENT 11668.14 C2048537720 41554.0 29885.86 \n", + "... ... ... ... ... ... ... \n", + "99995 10 PAYMENT 4020.66 C1410794718 159929.0 155908.34 \n", + "99996 10 PAYMENT 18345.49 C744303677 6206.0 0.00 \n", + "99997 10 CASH_IN 183774.91 C104331851 39173.0 222947.91 \n", + "99998 10 CASH_OUT 82237.17 C707662966 6031.0 0.00 \n", + "99999 10 PAYMENT 20096.56 C1868032458 110117.0 90020.44 \n", + "\n", + " nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud \n", + "0 M1979787155 0.00 0.00 0 0 \n", + "1 M2044282225 0.00 0.00 0 0 \n", + "2 C553264065 0.00 0.00 1 0 \n", + "3 C38997010 21182.00 0.00 1 0 \n", + "4 M1230701703 0.00 0.00 0 0 \n", + "... ... ... ... ... ... \n", + "99995 M1257036576 0.00 0.00 0 0 \n", + "99996 M1785344556 0.00 0.00 0 0 \n", + "99997 C36392889 54925.05 0.00 0 0 \n", + "99998 C1553004158 592635.66 799140.46 0 0 \n", + "99999 M1419201886 0.00 0.00 0 0 \n", + "\n", + "[100000 rows x 11 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0], dtype=int64)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['isFlaggedFraud'].unique()" ] }, { @@ -44,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -60,11 +339,93 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "step 0\n", + 
"type 0\n", + "amount 0\n", + "nameOrig 0\n", + "oldbalanceOrg 0\n", + "newbalanceOrig 0\n", + "nameDest 0\n", + "oldbalanceDest 0\n", + "newbalanceDest 0\n", + "isFraud 0\n", + "isFlaggedFraud 0\n", + "dtype: int64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "step int64\n", + "type object\n", + "amount float64\n", + "nameOrig object\n", + "oldbalanceOrg float64\n", + "newbalanceOrig float64\n", + "nameDest object\n", + "oldbalanceDest float64\n", + "newbalanceDest float64\n", + "isFraud int64\n", + "isFlaggedFraud int64\n", + "dtype: object" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ - "# Your code here\n" + "##data['isFraud'] = data['isFraud'].astype('object')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=int64)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['step'].unique()" ] }, { @@ -76,11 +437,114 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "## this is a classification problem\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "data = data.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.get_dummies(data, columns=['type'], prefix=['type'])" + ] + }, + { + 
"cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "X = data.drop(columns='isFraud')\n", + "y = data['isFraud']" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "lr = LogisticRegression()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Acer\\anaconda3\\envs\\ClassJuly7\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, + { + "data": { + "text/html": [ + "
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LogisticRegression()" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test data accuracy was 0.99908\n", + "Train data accuracy was 0.99888\n" + ] + } + ], + "source": [ + "print('Test data accuracy was', lr.score(X_test,y_test))\n", + "print('Train data accuracy was', lr.score(X_train,y_train))" ] }, { @@ -92,27 +556,88 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "X = data.drop(columns='isFraud')\n", + "y = data['isFraud']" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 61, "metadata": {}, + "outputs": [], "source": [ - "### Which model worked better and how do you know?" + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ - "# Your response here" + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "rd = RandomForestClassifier()" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestClassifier()" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rd.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test data accuracy was 0.99916\n", + "Train data accuracy was 0.9999866666666667\n" + ] + } + ], + "source": [ + "print('Test data accuracy was', rd.score(X_test,y_test))\n", + "print('Train data accuracy was', rd.score(X_train,y_train))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Which model worked better and how do you know?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The logistic regression fits better because the model I selected (random forest) is overfitting, which is not good for analyzing the dataset." ] }, { @@ -125,7 +650,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.13 ('ClassJuly7')", "language": "python", "name": "python3" }, @@ -139,7 +664,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "26ce022d4acb40739b66637a34781e557fcd0150734cbabfddd6b99484a439c0" + } } }, "nbformat": 4,