diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb
index a3a5359..a4dcdc7 100644
--- a/your-code/lab_imbalance.ipynb
+++ b/your-code/lab_imbalance.ipynb
@@ -28,11 +28,24 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "download = pd.read_csv('/Users/tiagoornelas/Downloads/PS_20174392719_1491204439457_log.csv')\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = download.sample(100000)"
]
},
{
@@ -44,11 +57,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 102,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 99873\n",
+ "1 127\n",
+ "Name: isFraud, dtype: int64"
+ ]
+ },
+ "execution_count": 102,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your response here"
+ "data['isFraud'].value_counts()"
]
},
{
@@ -60,11 +86,208 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " type | \n",
+ " amount | \n",
+ " nameOrig | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " nameDest | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 6193791 | \n",
+ " 571 | \n",
+ " CASH_IN | \n",
+ " 117210.89 | \n",
+ " C1305827522 | \n",
+ " 3450017.89 | \n",
+ " 3567228.79 | \n",
+ " C1603305724 | \n",
+ " 156595.94 | \n",
+ " 39385.04 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2085534 | \n",
+ " 182 | \n",
+ " PAYMENT | \n",
+ " 10490.04 | \n",
+ " C1504874999 | \n",
+ " 18531.00 | \n",
+ " 8040.96 | \n",
+ " M942041713 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1338882 | \n",
+ " 137 | \n",
+ " CASH_OUT | \n",
+ " 354353.49 | \n",
+ " C1580171723 | \n",
+ " 828.00 | \n",
+ " 0.00 | \n",
+ " C1086480347 | \n",
+ " 30216.95 | \n",
+ " 384570.43 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step type amount nameOrig oldbalanceOrg \\\n",
+ "6193791 571 CASH_IN 117210.89 C1305827522 3450017.89 \n",
+ "2085534 182 PAYMENT 10490.04 C1504874999 18531.00 \n",
+ "1338882 137 CASH_OUT 354353.49 C1580171723 828.00 \n",
+ "\n",
+ " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n",
+ "6193791 3567228.79 C1603305724 156595.94 39385.04 0 \n",
+ "2085534 8040.96 M942041713 0.00 0.00 0 \n",
+ "1338882 0.00 C1086480347 30216.95 384570.43 0 \n",
+ "\n",
+ " isFlaggedFraud \n",
+ "6193791 0 \n",
+ "2085534 0 \n",
+ "1338882 0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "step int64\n",
+ "type object\n",
+ "amount float64\n",
+ "nameOrig object\n",
+ "oldbalanceOrg float64\n",
+ "newbalanceOrig float64\n",
+ "nameDest object\n",
+ "oldbalanceDest float64\n",
+ "newbalanceDest float64\n",
+ "isFraud int64\n",
+ "isFlaggedFraud int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "display(data.head(3))\n",
+ "data.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# `step` is treated as an hour counter: floor-divide for the 1-based day and take the\n",
+ "# remainder for the hour, so both columns describe the same instant.\n",
+ "data['days'] = data['step'] // 24 + 1\n",
+ "data['hours'] = data['step'] % 24\n",
+ "\n",
+ "\n",
+ "def roundup(row):\n",
+ "    \"\"\"Return 1 when the given day value is 0, otherwise return it unchanged.\"\"\"\n",
+ "    return 1 if row == 0 else row\n",
+ "\n",
+ "\n",
+ "data['days'] = data['days'].apply(roundup)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def time(row):\n",
+ "    \"\"\"Build a January-2022 timestamp from the row's `days` and `hours` values.\"\"\"\n",
+ "    # int() turns 10.0 into 10; stripping '0' characters would turn day 10 into day 1.\n",
+ "    day = str(int(row['days']))\n",
+ "    hour = str(int(row['hours']))\n",
+ "    date = '01/2022'\n",
+ "    day_hour_month_year = day + '/' + date + ' ' + hour + ':00:00'\n",
+ "\n",
+ "    return pd.to_datetime(day_hour_month_year, format='%d/%m/%Y %H:%M:%S')\n",
+ "\n",
+ "data['step'] = data.apply(time, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 116,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import datetime as dt\n",
+ "\n",
+ "data['step']=data['step'].map(dt.datetime.toordinal)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = data.drop(['days', 'hours', 'nameOrig', 'nameDest'], axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here\n"
+ "dummies = pd.get_dummies(data['type'], prefix='type')\n",
+ "\n",
+ "data = pd.concat([data, dummies], axis=1)\n",
+ "\n",
+ "data = data.drop(['type'], axis=1)"
]
},
{
@@ -76,11 +299,39 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "X = data.drop(['isFraud', 'isFlaggedFraud'], axis=1)\n",
+ "y = data['isFraud']\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X,y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 124,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "train data accuracy was 0.9993733333333333\n",
+ "test data accuracy was 0.9994\n"
+ ]
+ }
+ ],
+ "source": [
+ "model = LogisticRegression()\n",
+ "\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "print('train data accuracy was', model.score(X_train, y_train))\n",
+ "print('test data accuracy was', model.score(X_test, y_test))"
]
},
{
@@ -96,7 +347,51 @@
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "from sklearn.model_selection import GridSearchCV\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "\n",
+ "## creating a list of hyperparameters \n",
+ "\n",
+ "n_estimators = [10,15,20]\n",
+ "max_depth = [3,6,10]\n",
+ "\n",
+ "\n",
+ "\n",
+ "grid = {'n_estimators': n_estimators, 'max_depth': max_depth}\n",
+ "\n",
+ "model = RandomForestClassifier()\n",
+ "\n",
+ "grid_search = GridSearchCV(estimator=model, param_grid=grid, cv = 5)\n",
+ "\n",
+ "grid_search.fit(X_train, y_train)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 131,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.99956"
+ ]
+ },
+ "execution_count": 131,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "forest = RandomForestClassifier(\n",
+ " n_estimators=10,\n",
+ " max_depth= 10)\n",
+ "\n",
+ "\n",
+ "forest.fit(X_train, y_train)\n",
+ "\n",
+ "forest.score(X_test, y_test)"
]
},
{
@@ -108,11 +403,11 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "# Your response here"
+ "# After hyperparameter tuning, the random forest classifier has a marginally higher accuracy than the logistic regression."
]
},
{
@@ -125,7 +420,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3.9.13 ('ironclasses')",
"language": "python",
"name": "python3"
},
@@ -139,7 +434,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.9.13"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "5bfe8cd4d37db90c8b9468f14ce817f9cd906576255a34a1d0cefacef45ef3b9"
+ }
}
},
"nbformat": 4,