Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
328 changes: 314 additions & 14 deletions your-code/lab_imbalance.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,24 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"# Your code here"
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"download = pd.read_csv('/Users/tiagoornelas/Downloads/PS_20174392719_1491204439457_log.csv')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"data = download.sample(100000)"
]
},
{
Expand All @@ -44,11 +57,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 102,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"0 99873\n",
"1 127\n",
"Name: isFraud, dtype: int64"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Your response here"
"data['isFraud'].value_counts()"
]
},
{
Expand All @@ -60,11 +86,208 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>step</th>\n",
" <th>type</th>\n",
" <th>amount</th>\n",
" <th>nameOrig</th>\n",
" <th>oldbalanceOrg</th>\n",
" <th>newbalanceOrig</th>\n",
" <th>nameDest</th>\n",
" <th>oldbalanceDest</th>\n",
" <th>newbalanceDest</th>\n",
" <th>isFraud</th>\n",
" <th>isFlaggedFraud</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6193791</th>\n",
" <td>571</td>\n",
" <td>CASH_IN</td>\n",
" <td>117210.89</td>\n",
" <td>C1305827522</td>\n",
" <td>3450017.89</td>\n",
" <td>3567228.79</td>\n",
" <td>C1603305724</td>\n",
" <td>156595.94</td>\n",
" <td>39385.04</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2085534</th>\n",
" <td>182</td>\n",
" <td>PAYMENT</td>\n",
" <td>10490.04</td>\n",
" <td>C1504874999</td>\n",
" <td>18531.00</td>\n",
" <td>8040.96</td>\n",
" <td>M942041713</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1338882</th>\n",
" <td>137</td>\n",
" <td>CASH_OUT</td>\n",
" <td>354353.49</td>\n",
" <td>C1580171723</td>\n",
" <td>828.00</td>\n",
" <td>0.00</td>\n",
" <td>C1086480347</td>\n",
" <td>30216.95</td>\n",
" <td>384570.43</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" step type amount nameOrig oldbalanceOrg \\\n",
"6193791 571 CASH_IN 117210.89 C1305827522 3450017.89 \n",
"2085534 182 PAYMENT 10490.04 C1504874999 18531.00 \n",
"1338882 137 CASH_OUT 354353.49 C1580171723 828.00 \n",
"\n",
" newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n",
"6193791 3567228.79 C1603305724 156595.94 39385.04 0 \n",
"2085534 8040.96 M942041713 0.00 0.00 0 \n",
"1338882 0.00 C1086480347 30216.95 384570.43 0 \n",
"\n",
" isFlaggedFraud \n",
"6193791 0 \n",
"2085534 0 \n",
"1338882 0 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"step int64\n",
"type object\n",
"amount float64\n",
"nameOrig object\n",
"oldbalanceOrg float64\n",
"newbalanceOrig float64\n",
"nameDest object\n",
"oldbalanceDest float64\n",
"newbalanceDest float64\n",
"isFraud int64\n",
"isFlaggedFraud int64\n",
"dtype: object"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"display(data.head(3))\n",
"data.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"data['days'] = round(data['step'] / 24)\n",
"data['hours'] = data['step'] % 24 \n",
"\n",
"\n",
"def roundup (row):\n",
"\n",
" if row == 0:\n",
" return 1\n",
" else:\n",
" return row\n",
"\n",
"data['days'] = data['days'].apply(roundup)\n"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"def time(row):\n",
"\n",
" day = str(row['days']).replace('0','').replace('.','')\n",
" hour = str(row['hours'])\n",
" date = '01/2022'\n",
" day_hour_month_year = day + '/' + date + ' ' + hour + ':00:00'\n",
"\n",
" return pd.to_datetime(day_hour_month_year, format='%d/%m/%Y %H:%M:%S')\n",
"\n",
"\n",
"data['step'] = data.apply(time, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"import datetime as dt\n",
"\n",
"data['step']=data['step'].map(dt.datetime.toordinal)"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"data = data.drop(['days', 'hours', 'nameOrig', 'nameDest'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"# Your code here\n"
"dummies = pd.get_dummies(data['type'], prefix='type')\n",
"\n",
"data = pd.concat([data, dummies], axis=1)\n",
"\n",
"data = data.drop(['type'], axis=1)"
]
},
{
Expand All @@ -76,11 +299,39 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"# Your code here"
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"X = data.drop(['isFraud', 'isFlaggedFraud'], axis=1)\n",
"y = data['isFraud']\n",
"X_train, X_test, y_train, y_test = train_test_split(X,y)"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train data accuracy was 0.9993733333333333\n",
"test data accuracy was 0.9994\n"
]
}
],
"source": [
"model = LogisticRegression()\n",
"\n",
"model.fit(X_train, y_train)\n",
"\n",
"print('train data accuracy was', model.score(X_train, y_train))\n",
"print('test data accuracy was', model.score(X_test, y_test))"
]
},
{
Expand All @@ -96,7 +347,51 @@
"metadata": {},
"outputs": [],
"source": [
"# Your code here"
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"## creating a list of hyperparameters \n",
"\n",
"n_estimators = [10,15,20]\n",
"max_depth = [3,6,10]\n",
"\n",
"\n",
"\n",
"grid = {'n_estimators': n_estimators, 'max_depth': max_depth}\n",
"\n",
"model = RandomForestClassifier()\n",
"\n",
"grid_search = GridSearchCV(estimator=model, param_grid=grid, cv = 5)\n",
"\n",
"grid_search.fit(X_train, y_train)\n"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.99956"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"forest = RandomForestClassifier(\n",
" n_estimators=10,\n",
" max_depth= 10)\n",
"\n",
"\n",
"forest.fit(X_train, y_train)\n",
"\n",
"forest.score(X_test, y_test)"
]
},
{
Expand All @@ -108,11 +403,11 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Your response here"
"# after hypertuning random forest classifier, it has a marginally higher accuracy than the logistic regression"
]
},
{
Expand All @@ -125,7 +420,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.9.13 ('ironclasses')",
"language": "python",
"name": "python3"
},
Expand All @@ -139,7 +434,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.9.13"
},
"vscode": {
"interpreter": {
"hash": "5bfe8cd4d37db90c8b9468f14ce817f9cd906576255a34a1d0cefacef45ef3b9"
}
}
},
"nbformat": 4,
Expand Down