From 756081682b4f798394186e408fe6a07e12b723be Mon Sep 17 00:00:00 2001 From: tiago1999 Date: Mon, 22 Aug 2022 23:22:56 +0100 Subject: [PATCH] done --- your-code/lab_imbalance.ipynb | 328 ++++++++++++++++++++++++++++++++-- 1 file changed, 314 insertions(+), 14 deletions(-) diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb index a3a5359..a4dcdc7 100644 --- a/your-code/lab_imbalance.ipynb +++ b/your-code/lab_imbalance.ipynb @@ -28,11 +28,24 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "download = pd.read_csv('/Users/tiagoornelas/Downloads/PS_20174392719_1491204439457_log.csv')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "data = download.sample(100000)" ] }, { @@ -44,11 +57,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 102, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 99873\n", + "1 127\n", + "Name: isFraud, dtype: int64" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your response here" + "data['isFraud'].value_counts()" ] }, { @@ -60,11 +86,208 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
6193791571CASH_IN117210.89C13058275223450017.893567228.79C1603305724156595.9439385.0400
2085534182PAYMENT10490.04C150487499918531.008040.96M9420417130.000.0000
1338882137CASH_OUT354353.49C1580171723828.000.00C108648034730216.95384570.4300
\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg \\\n", + "6193791 571 CASH_IN 117210.89 C1305827522 3450017.89 \n", + "2085534 182 PAYMENT 10490.04 C1504874999 18531.00 \n", + "1338882 137 CASH_OUT 354353.49 C1580171723 828.00 \n", + "\n", + " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n", + "6193791 3567228.79 C1603305724 156595.94 39385.04 0 \n", + "2085534 8040.96 M942041713 0.00 0.00 0 \n", + "1338882 0.00 C1086480347 30216.95 384570.43 0 \n", + "\n", + " isFlaggedFraud \n", + "6193791 0 \n", + "2085534 0 \n", + "1338882 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "step int64\n", + "type object\n", + "amount float64\n", + "nameOrig object\n", + "oldbalanceOrg float64\n", + "newbalanceOrig float64\n", + "nameDest object\n", + "oldbalanceDest float64\n", + "newbalanceDest float64\n", + "isFraud int64\n", + "isFlaggedFraud int64\n", + "dtype: object" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "display(data.head(3))\n", + "data.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "data['days'] = round(data['step'] / 24)\n", + "data['hours'] = data['step'] % 24 \n", + "\n", + "\n", + "def roundup (row):\n", + "\n", + " if row == 0:\n", + " return 1\n", + " else:\n", + " return row\n", + "\n", + "data['days'] = data['days'].apply(roundup)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "def time(row):\n", + "\n", + " day = str(row['days']).replace('0','').replace('.','')\n", + " hour = str(row['hours'])\n", + " date = '01/2022'\n", + " day_hour_month_year = day + '/' + date + ' ' + hour + ':00:00'\n", + "\n", + " return pd.to_datetime(day_hour_month_year, format='%d/%m/%Y %H:%M:%S')\n", + "\n", + "\n", + "data['step'] = data.apply(time, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime as dt\n", + "\n", + "data['step']=data['step'].map(dt.datetime.toordinal)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "data = data.drop(['days', 'hours', 'nameOrig', 'nameDest'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ - "# Your code here\n" + "dummies = pd.get_dummies(data['type'], prefix='type')\n", + "\n", + "data = pd.concat([data, dummies], axis=1)\n", + "\n", + "data = data.drop(['type'], axis=1)" ] }, { @@ -76,11 +299,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = data.drop(['isFraud', 'isFlaggedFraud'], axis=1)\n", + "y = data['isFraud']\n", + "X_train, X_test, y_train, y_test = train_test_split(X,y)" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train data accuracy was 0.9993733333333333\n", + "test data accuracy was 0.9994\n" + ] + } + ], + "source": [ + "model = LogisticRegression()\n", + "\n", + "model.fit(X_train, y_train)\n", + "\n", + "print('train data accuracy was', model.score(X_train, y_train))\n", + "print('test data accuracy was', model.score(X_test, y_test))" ] }, { @@ -96,7 +347,51 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "## creating a list of hyperparameters \n", + "\n", + "n_estimators = [10,15,20]\n", + "max_depth = [3,6,10]\n", + "\n", + "\n", + "\n", + "grid = {'n_estimators': n_estimators, 'max_depth': max_depth}\n", + "\n", + "model = RandomForestClassifier()\n", + "\n", + "grid_search = GridSearchCV(estimator=model, param_grid=grid, cv = 5)\n", + "\n", + "grid_search.fit(X_train, y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.99956" + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "forest = RandomForestClassifier(\n", + " n_estimators=10,\n", + " max_depth= 10)\n", + "\n", + "\n", + "forest.fit(X_train, y_train)\n", + "\n", + "forest.score(X_test, y_test)" ] }, { @@ -108,11 +403,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Your response here" + "# after hypertuning random forest classifier, it has a marginally higher accuracy than the logistic regression" ] }, { @@ -125,7 +420,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.13 ('ironclasses')", "language": "python", "name": "python3" }, @@ -139,7 +434,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "5bfe8cd4d37db90c8b9468f14ce817f9cd906576255a34a1d0cefacef45ef3b9" + } } }, "nbformat": 4,