diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb index a3a5359..adc7872 100644 --- a/your-code/lab_imbalance.ipynb +++ b/your-code/lab_imbalance.ipynb @@ -32,7 +32,20 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.preprocessing import RobustScaler, StandardScaler, PolynomialFeatures, MinMaxScaler\n", + "import seaborn as sns\n", + "\n", + "import pandas as pd\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score" ] }, { @@ -44,11 +57,153 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# Your response here" + "data = pd.read_csv('PS_20174392719_1491204439457_log.csv').sample(100000)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(100000, 11)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "step int64\n", + "type object\n", + "amount float64\n", + "nameOrig object\n", + "oldbalanceOrg float64\n", + "newbalanceOrig float64\n", + "nameDest object\n", + "oldbalanceDest float64\n", + "newbalanceDest float64\n", + "isFraud int64\n", + "isFlaggedFraud int64\n", + "dtype: object" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
steptypeamountnameOrigoldbalanceOrgnewbalanceOrignameDestoldbalanceDestnewbalanceDestisFraudisFlaggedFraud
4465129324CASH_OUT668156.34C12513484640.00.0C49759910917157234.8317825391.1700
5870062403TRANSFER2544832.46C4787328650.00.0C17541645587158922.079703754.5400
5956605405CASH_OUT209591.50C173303725951908.00.0C7119441030.00209591.5000
\n", + "
" + ], + "text/plain": [ + " step type amount nameOrig oldbalanceOrg \\\n", + "4465129 324 CASH_OUT 668156.34 C1251348464 0.0 \n", + "5870062 403 TRANSFER 2544832.46 C478732865 0.0 \n", + "5956605 405 CASH_OUT 209591.50 C1733037259 51908.0 \n", + "\n", + " newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud \\\n", + "4465129 0.0 C497599109 17157234.83 17825391.17 0 \n", + "5870062 0.0 C1754164558 7158922.07 9703754.54 0 \n", + "5956605 0.0 C711944103 0.00 209591.50 0 \n", + "\n", + " isFlaggedFraud \n", + "4465129 0 \n", + "5870062 0 \n", + "5956605 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(data.shape)\n", + "display(data.dtypes)\n", + "display(data.head(3))\n" ] }, { @@ -60,27 +215,77 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "# Your code here\n" + "# The sampled transactions were loaded into data above; isFraud is the target\n", + "y = data['isFraud']\n", + "X = data.drop(labels='isFraud', axis=1)\n", + "\n", + "# divide train test; stratify so the rare fraud class keeps the same share in both sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)\n" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "### Run a logisitc regression classifier and evaluate its accuracy." + "### Run a logistic regression classifier and evaluate its accuracy." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No samples remaining after dropping missing values.\n" + ] + } + ], "source": [ - "# Your code here" + "import pandas as pd\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "# Work on a copy so the X_train from the split cell is not mutated in place\n", + "X_model = X_train.copy()\n", + "\n", + "# nameOrig/nameDest are account IDs; coercing them (and type) with pd.to_numeric\n", + "# made every value NaN, so dropna() removed every row. Drop the IDs instead.\n", + "X_model = X_model.drop(columns=['nameOrig', 'nameDest'], errors='ignore')\n", + "\n", + "# One-hot encode the remaining string columns (e.g. the transaction type)\n", + "X_model = pd.get_dummies(X_model, dtype=int)\n", + "\n", + "# Drop rows with missing values and keep the target aligned with the features\n", + "X_model = X_model.dropna()\n", + "y_model = y_train.loc[X_model.index]\n", + "\n", + "# Check if there are enough samples remaining\n", + "if len(X_model) == 0:\n", + "    print(\"No samples remaining after dropping missing values.\")\n", + "else:\n", + "    # Split into training and testing sets, keeping the fraud share equal in both\n", + "    X_train, X_test, y_train, y_test = train_test_split(X_model, y_model, test_size=0.2, random_state=42, stratify=y_model)\n", + "\n", + "    # Fit the logistic regression model (features are unscaled, so allow more iterations)\n", + "    lr = LogisticRegression(max_iter=1000)\n", + "    lr.fit(X_train, y_train)\n", + "\n", + "    # Make predictions on the test set\n", + "    y_pred = lr.predict(X_test)\n", + "\n", + "    # Calculate the accuracy of the model\n", + "    accuracy = accuracy_score(y_test, y_pred)\n", + "    print(f\"Logistic Regression Test Accuracy: {accuracy * 100:.2f}%\")\n", + "\n" ] }, { @@ -92,11 +297,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ValueError", + "evalue": "Found 
array with 0 sample(s) (shape=(0, 10)) while a minimum of 1 is required by DecisionTreeClassifier.", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[40], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtree\u001b[39;00m \u001b[39mimport\u001b[39;00m DecisionTreeClassifier\n\u001b[0;32m 2\u001b[0m dtc \u001b[39m=\u001b[39m DecisionTreeClassifier()\n\u001b[1;32m----> 3\u001b[0m dtc\u001b[39m.\u001b[39;49mfit(X_train, y_train)\n\u001b[0;32m 4\u001b[0m y_pred \u001b[39m=\u001b[39m dtc\u001b[39m.\u001b[39mpredict(X_test)\n\u001b[0;32m 5\u001b[0m acc \u001b[39m=\u001b[39m dtc\u001b[39m.\u001b[39mscore(X_test, y_test)\u001b[39m*\u001b[39m\u001b[39m100\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py:889\u001b[0m, in \u001b[0;36mDecisionTreeClassifier.fit\u001b[1;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[0;32m 859\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mfit\u001b[39m(\u001b[39mself\u001b[39m, X, y, sample_weight\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, check_input\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m):\n\u001b[0;32m 860\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Build a decision tree classifier from the training set (X, y).\u001b[39;00m\n\u001b[0;32m 861\u001b[0m \n\u001b[0;32m 862\u001b[0m \u001b[39m Parameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 886\u001b[0m \u001b[39m Fitted estimator.\u001b[39;00m\n\u001b[0;32m 887\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 889\u001b[0m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mfit(\n\u001b[0;32m 890\u001b[0m X,\n\u001b[0;32m 891\u001b[0m y,\n\u001b[0;32m 892\u001b[0m 
sample_weight\u001b[39m=\u001b[39;49msample_weight,\n\u001b[0;32m 893\u001b[0m check_input\u001b[39m=\u001b[39;49mcheck_input,\n\u001b[0;32m 894\u001b[0m )\n\u001b[0;32m 895\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py:186\u001b[0m, in \u001b[0;36mBaseDecisionTree.fit\u001b[1;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[0;32m 184\u001b[0m check_X_params \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m(dtype\u001b[39m=\u001b[39mDTYPE, accept_sparse\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcsc\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 185\u001b[0m check_y_params \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m(ensure_2d\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, dtype\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m)\n\u001b[1;32m--> 186\u001b[0m X, y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_data(\n\u001b[0;32m 187\u001b[0m X, y, validate_separately\u001b[39m=\u001b[39;49m(check_X_params, check_y_params)\n\u001b[0;32m 188\u001b[0m )\n\u001b[0;32m 189\u001b[0m \u001b[39mif\u001b[39;00m issparse(X):\n\u001b[0;32m 190\u001b[0m X\u001b[39m.\u001b[39msort_indices()\n", + "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\base.py:579\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[0;32m 577\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mestimator\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m check_X_params:\n\u001b[0;32m 578\u001b[0m check_X_params \u001b[39m=\u001b[39m {\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdefault_check_params, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_X_params}\n\u001b[1;32m--> 579\u001b[0m X \u001b[39m=\u001b[39m check_array(X, 
input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mX\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_X_params)\n\u001b[0;32m 580\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mestimator\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m check_y_params:\n\u001b[0;32m 581\u001b[0m check_y_params \u001b[39m=\u001b[39m {\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdefault_check_params, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_y_params}\n", + "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:931\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 929\u001b[0m n_samples \u001b[39m=\u001b[39m _num_samples(array)\n\u001b[0;32m 930\u001b[0m \u001b[39mif\u001b[39;00m n_samples \u001b[39m<\u001b[39m ensure_min_samples:\n\u001b[1;32m--> 931\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 932\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFound array with \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m sample(s) (shape=\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m) while a\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 933\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m minimum of \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m is required\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 934\u001b[0m \u001b[39m%\u001b[39m (n_samples, array\u001b[39m.\u001b[39mshape, ensure_min_samples, context)\n\u001b[0;32m 935\u001b[0m )\n\u001b[0;32m 937\u001b[0m \u001b[39mif\u001b[39;00m ensure_min_features \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m \u001b[39mand\u001b[39;00m array\u001b[39m.\u001b[39mndim \u001b[39m==\u001b[39m \u001b[39m2\u001b[39m:\n\u001b[0;32m 938\u001b[0m n_features 
\u001b[39m=\u001b[39m array\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m]\n", + "\u001b[1;31mValueError\u001b[0m: Found array with 0 sample(s) (shape=(0, 10)) while a minimum of 1 is required by DecisionTreeClassifier." + ] + } + ], "source": [ - "# Your code here" + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import confusion_matrix\n", + "dtc = DecisionTreeClassifier()\n", + "dtc.fit(X_train, y_train)\n", + "y_pred = dtc.predict(X_test)\n", + "acc = dtc.score(X_test, y_test)*100\n", + "print(f\"Decision Tree Test Accuracy {round(acc, 2)}%\")\n", + "cm = confusion_matrix(y_test, y_pred, labels=[0, 1])\n", + "print(cm)\n", + "print('Precision ', cm[1,1]/(cm[0,1] + cm[1,1]))" ] }, { @@ -108,11 +338,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Your response here" + "# I completely and utterly messed up and don't even know how to fix it even with chatGPT" ] }, { @@ -139,7 +369,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.10.9" } }, "nbformat": 4,