diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb
index a3a5359..adc7872 100644
--- a/your-code/lab_imbalance.ipynb
+++ b/your-code/lab_imbalance.ipynb
@@ -32,7 +32,20 @@
"metadata": {},
"outputs": [],
"source": [
- "# Your code here"
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "from sklearn.preprocessing import RobustScaler, StandardScaler, PolynomialFeatures, MinMaxScaler\n",
+ "import seaborn as sns\n",
+ "\n",
+ "import pandas as pd\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import accuracy_score"
]
},
{
@@ -44,11 +57,153 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
- "# Your response here"
+ "# The full transaction log is large, so work on a fixed-size random sample.\n",
+ "# random_state pins the sample so every Restart & Run All sees the same rows.\n",
+ "data = pd.read_csv('PS_20174392719_1491204439457_log.csv').sample(n=100000, random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(100000, 11)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "step int64\n",
+ "type object\n",
+ "amount float64\n",
+ "nameOrig object\n",
+ "oldbalanceOrg float64\n",
+ "newbalanceOrig float64\n",
+ "nameDest object\n",
+ "oldbalanceDest float64\n",
+ "newbalanceDest float64\n",
+ "isFraud int64\n",
+ "isFlaggedFraud int64\n",
+ "dtype: object"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " step | \n",
+ " type | \n",
+ " amount | \n",
+ " nameOrig | \n",
+ " oldbalanceOrg | \n",
+ " newbalanceOrig | \n",
+ " nameDest | \n",
+ " oldbalanceDest | \n",
+ " newbalanceDest | \n",
+ " isFraud | \n",
+ " isFlaggedFraud | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 4465129 | \n",
+ " 324 | \n",
+ " CASH_OUT | \n",
+ " 668156.34 | \n",
+ " C1251348464 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " C497599109 | \n",
+ " 17157234.83 | \n",
+ " 17825391.17 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5870062 | \n",
+ " 403 | \n",
+ " TRANSFER | \n",
+ " 2544832.46 | \n",
+ " C478732865 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " C1754164558 | \n",
+ " 7158922.07 | \n",
+ " 9703754.54 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5956605 | \n",
+ " 405 | \n",
+ " CASH_OUT | \n",
+ " 209591.50 | \n",
+ " C1733037259 | \n",
+ " 51908.0 | \n",
+ " 0.0 | \n",
+ " C711944103 | \n",
+ " 0.00 | \n",
+ " 209591.50 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " step type amount nameOrig oldbalanceOrg \\\n",
+ "4465129 324 CASH_OUT 668156.34 C1251348464 0.0 \n",
+ "5870062 403 TRANSFER 2544832.46 C478732865 0.0 \n",
+ "5956605 405 CASH_OUT 209591.50 C1733037259 51908.0 \n",
+ "\n",
+ " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n",
+ "4465129 0.0 C497599109 17157234.83 17825391.17 0 \n",
+ "5870062 0.0 C1754164558 7158922.07 9703754.54 0 \n",
+ "5956605 0.0 C711944103 0.00 209591.50 0 \n",
+ "\n",
+ " isFlaggedFraud \n",
+ "4465129 0 \n",
+ "5870062 0 \n",
+ "5956605 0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Overview of the sample: (rows, columns), per-column dtypes, then the first three rows.\n",
+ "display(data.shape, data.dtypes, data.head(3))\n"
]
},
{
@@ -60,27 +215,77 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here\n"
+ "# Target and features come from `data`, the sampled frame loaded by the read_csv cell.\n",
+ "y = data['isFraud']\n",
+ "X = data.drop(columns='isFraud')\n",
+ "\n",
+ "# Hold out 20% for testing. stratify=y keeps the fraud / non-fraud ratio the same in\n",
+ "# both splits (this lab is about class imbalance), and random_state makes the split\n",
+ "# reproducible. NOTE(review): stratify needs at least two rows per class -- confirm\n",
+ "# the sample always contains enough fraud cases.\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X, y, test_size=0.2, stratify=y, random_state=42\n",
+ ")\n"
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Run a logisitc regression classifier and evaluate its accuracy."
+ "### Run a logistic regression classifier and evaluate its accuracy."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 38,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "No samples remaining after dropping missing values.\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here"
+ "# scikit-learn estimators need numeric features. The string columns cannot be\n",
+ "# parsed as numbers: pd.to_numeric(..., errors='coerce') turns every value into\n",
+ "# NaN, and dropping NaN rows then empties the training set. Encode them instead.\n",
+ "ID_COLUMNS = ['nameOrig', 'nameDest']  # look like per-account identifiers -- confirm; dropped, not encoded\n",
+ "\n",
+ "\n",
+ "def encode_features(frame):\n",
+ "    \"\"\"Return a numeric version of `frame` for model fitting.\n",
+ "\n",
+ "    Columns listed in ID_COLUMNS are dropped (ignored when already absent) and any\n",
+ "    remaining string/categorical column, e.g. `type`, is one-hot encoded.\n",
+ "    A frame that is already numeric keeps the same columns, so the cell can be re-run.\n",
+ "    \"\"\"\n",
+ "    without_ids = frame.drop(columns=ID_COLUMNS, errors='ignore')\n",
+ "    return pd.get_dummies(without_ids, dtype=int)\n",
+ "\n",
+ "\n",
+ "X_train = encode_features(X_train)\n",
+ "# Give the test set exactly the training columns: a category that is absent\n",
+ "# from the test split becomes an all-zero column instead of a shape mismatch.\n",
+ "X_test = encode_features(X_test).reindex(columns=X_train.columns, fill_value=0)\n",
+ "\n",
+ "# Drop rows with missing values and keep each target aligned with its features\n",
+ "X_train = X_train.dropna()\n",
+ "y_train = y_train.loc[X_train.index]\n",
+ "X_test = X_test.dropna()\n",
+ "y_test = y_test.loc[X_test.index]\n",
+ "\n",
+ "# Fail loudly here rather than letting later cells crash on empty data\n",
+ "if X_train.empty or X_test.empty:\n",
+ "    raise ValueError(\"No samples remaining after dropping missing values.\")\n",
+ "\n",
+ "# Fit on the training split only; the hold-out split made earlier is kept for scoring\n",
+ "lr = LogisticRegression(max_iter=1000)  # unscaled amounts can need more than the default iterations\n",
+ "lr.fit(X_train, y_train)\n",
+ "\n",
+ "# Accuracy on the hold-out set\n",
+ "y_pred = lr.predict(X_test)\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print(f\"Logistic Regression Test Accuracy: {accuracy * 100:.2f}%\")\n"
]
},
{
@@ -92,11 +297,36 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 40,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "ValueError",
+ "evalue": "Found array with 0 sample(s) (shape=(0, 10)) while a minimum of 1 is required by DecisionTreeClassifier.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[40], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtree\u001b[39;00m \u001b[39mimport\u001b[39;00m DecisionTreeClassifier\n\u001b[0;32m 2\u001b[0m dtc \u001b[39m=\u001b[39m DecisionTreeClassifier()\n\u001b[1;32m----> 3\u001b[0m dtc\u001b[39m.\u001b[39;49mfit(X_train, y_train)\n\u001b[0;32m 4\u001b[0m y_pred \u001b[39m=\u001b[39m dtc\u001b[39m.\u001b[39mpredict(X_test)\n\u001b[0;32m 5\u001b[0m acc \u001b[39m=\u001b[39m dtc\u001b[39m.\u001b[39mscore(X_test, y_test)\u001b[39m*\u001b[39m\u001b[39m100\u001b[39m\n",
+ "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py:889\u001b[0m, in \u001b[0;36mDecisionTreeClassifier.fit\u001b[1;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[0;32m 859\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mfit\u001b[39m(\u001b[39mself\u001b[39m, X, y, sample_weight\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, check_input\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m):\n\u001b[0;32m 860\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Build a decision tree classifier from the training set (X, y).\u001b[39;00m\n\u001b[0;32m 861\u001b[0m \n\u001b[0;32m 862\u001b[0m \u001b[39m Parameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 886\u001b[0m \u001b[39m Fitted estimator.\u001b[39;00m\n\u001b[0;32m 887\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 889\u001b[0m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mfit(\n\u001b[0;32m 890\u001b[0m X,\n\u001b[0;32m 891\u001b[0m y,\n\u001b[0;32m 892\u001b[0m sample_weight\u001b[39m=\u001b[39;49msample_weight,\n\u001b[0;32m 893\u001b[0m check_input\u001b[39m=\u001b[39;49mcheck_input,\n\u001b[0;32m 894\u001b[0m )\n\u001b[0;32m 895\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n",
+ "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py:186\u001b[0m, in \u001b[0;36mBaseDecisionTree.fit\u001b[1;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[0;32m 184\u001b[0m check_X_params \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m(dtype\u001b[39m=\u001b[39mDTYPE, accept_sparse\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcsc\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 185\u001b[0m check_y_params \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m(ensure_2d\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, dtype\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m)\n\u001b[1;32m--> 186\u001b[0m X, y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_data(\n\u001b[0;32m 187\u001b[0m X, y, validate_separately\u001b[39m=\u001b[39;49m(check_X_params, check_y_params)\n\u001b[0;32m 188\u001b[0m )\n\u001b[0;32m 189\u001b[0m \u001b[39mif\u001b[39;00m issparse(X):\n\u001b[0;32m 190\u001b[0m X\u001b[39m.\u001b[39msort_indices()\n",
+ "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\base.py:579\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[0;32m 577\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mestimator\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m check_X_params:\n\u001b[0;32m 578\u001b[0m check_X_params \u001b[39m=\u001b[39m {\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdefault_check_params, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_X_params}\n\u001b[1;32m--> 579\u001b[0m X \u001b[39m=\u001b[39m check_array(X, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mX\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_X_params)\n\u001b[0;32m 580\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mestimator\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m check_y_params:\n\u001b[0;32m 581\u001b[0m check_y_params \u001b[39m=\u001b[39m {\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdefault_check_params, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_y_params}\n",
+ "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:931\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 929\u001b[0m n_samples \u001b[39m=\u001b[39m _num_samples(array)\n\u001b[0;32m 930\u001b[0m \u001b[39mif\u001b[39;00m n_samples \u001b[39m<\u001b[39m ensure_min_samples:\n\u001b[1;32m--> 931\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 932\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFound array with \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m sample(s) (shape=\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m) while a\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 933\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m minimum of \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m is required\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 934\u001b[0m \u001b[39m%\u001b[39m (n_samples, array\u001b[39m.\u001b[39mshape, ensure_min_samples, context)\n\u001b[0;32m 935\u001b[0m )\n\u001b[0;32m 937\u001b[0m \u001b[39mif\u001b[39;00m ensure_min_features \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m \u001b[39mand\u001b[39;00m array\u001b[39m.\u001b[39mndim \u001b[39m==\u001b[39m \u001b[39m2\u001b[39m:\n\u001b[0;32m 938\u001b[0m n_features \u001b[39m=\u001b[39m array\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m]\n",
+ "\u001b[1;31mValueError\u001b[0m: Found array with 0 sample(s) (shape=(0, 10)) while a minimum of 1 is required by DecisionTreeClassifier."
+ ]
+ }
+ ],
"source": [
- "# Your code here"
+ "from sklearn.metrics import confusion_matrix\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "\n",
+ "# Fit a decision tree on the training split; the seed makes the fitted tree repeatable\n",
+ "dtc = DecisionTreeClassifier(random_state=42)\n",
+ "dtc.fit(X_train, y_train)\n",
+ "y_pred = dtc.predict(X_test)\n",
+ "acc = dtc.score(X_test, y_test)*100\n",
+ "print(f\"Decision Tree Test Accuracy {round(acc, 2)}%\")\n",
+ "\n",
+ "# Rows are true classes, columns are predicted classes. labels=[0, 1] forces a 2x2\n",
+ "# matrix even when a class is missing (assumes isFraud is coded 0/1 -- confirm).\n",
+ "cm = confusion_matrix(y_test, y_pred, labels=[0, 1])\n",
+ "print(cm)\n",
+ "\n",
+ "# Precision = TP / (TP + FP); report 0.0 rather than dividing by zero when no row is predicted as fraud\n",
+ "predicted_fraud = cm[0, 1] + cm[1, 1]\n",
+ "print('Precision ', cm[1, 1] / predicted_fraud if predicted_fraud else 0.0)"
]
},
{
@@ -108,11 +338,11 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "# Your response here"
+ "# I completely and utterly messed up and don't even know how to fix it, even with ChatGPT."
]
},
{
@@ -139,7 +369,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.10.9"
}
},
"nbformat": 4,