ta-data-lis · JavierIronhack · Jun 6, 2023
diff --git a/your-code/lab_imbalance.ipynb b/your-code/lab_imbalance.ipynb
@@ -32,7 +32,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here"
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.preprocessing import RobustScaler, StandardScaler, PolynomialFeatures, MinMaxScaler\n",
+    "import seaborn as sns\n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score"
    ]
   },
   {
@@ -44,11 +57,153 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your response here"
+    "# Work on a 100,000-row random sample of the transaction log; the fixed\n",
+    "# seed makes the sample (and every result derived from it) reproducible.\n",
+    "data = pd.read_csv('PS_20174392719_1491204439457_log.csv').sample(n=100000, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(100000, 11)"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "step                int64\n",
+       "type               object\n",
+       "amount            float64\n",
+       "nameOrig           object\n",
+       "oldbalanceOrg     float64\n",
+       "newbalanceOrig    float64\n",
+       "nameDest           object\n",
+       "oldbalanceDest    float64\n",
+       "newbalanceDest    float64\n",
+       "isFraud             int64\n",
+       "isFlaggedFraud      int64\n",
+       "dtype: object"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>step</th>\n",
+       "      <th>type</th>\n",
+       "      <th>amount</th>\n",
+       "      <th>nameOrig</th>\n",
+       "      <th>oldbalanceOrg</th>\n",
+       "      <th>newbalanceOrig</th>\n",
+       "      <th>nameDest</th>\n",
+       "      <th>oldbalanceDest</th>\n",
+       "      <th>newbalanceDest</th>\n",
+       "      <th>isFraud</th>\n",
+       "      <th>isFlaggedFraud</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>4465129</th>\n",
+       "      <td>324</td>\n",
+       "      <td>CASH_OUT</td>\n",
+       "      <td>668156.34</td>\n",
+       "      <td>C1251348464</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>C497599109</td>\n",
+       "      <td>17157234.83</td>\n",
+       "      <td>17825391.17</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5870062</th>\n",
+       "      <td>403</td>\n",
+       "      <td>TRANSFER</td>\n",
+       "      <td>2544832.46</td>\n",
+       "      <td>C478732865</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>C1754164558</td>\n",
+       "      <td>7158922.07</td>\n",
+       "      <td>9703754.54</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5956605</th>\n",
+       "      <td>405</td>\n",
+       "      <td>CASH_OUT</td>\n",
+       "      <td>209591.50</td>\n",
+       "      <td>C1733037259</td>\n",
+       "      <td>51908.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>C711944103</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>209591.50</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         step      type      amount     nameOrig  oldbalanceOrg  \\\n",
+       "4465129   324  CASH_OUT   668156.34  C1251348464            0.0   \n",
+       "5870062   403  TRANSFER  2544832.46   C478732865            0.0   \n",
+       "5956605   405  CASH_OUT   209591.50  C1733037259        51908.0   \n",
+       "\n",
+       "         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \\\n",
+       "4465129             0.0   C497599109     17157234.83     17825391.17        0   \n",
+       "5870062             0.0  C1754164558      7158922.07      9703754.54        0   \n",
+       "5956605             0.0   C711944103            0.00       209591.50        0   \n",
+       "\n",
+       "         isFlaggedFraud  \n",
+       "4465129               0  \n",
+       "5870062               0  \n",
+       "5956605               0  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Quick overview of the sample: dimensions, column dtypes, first three rows.\n",
+    "display(data.shape, data.dtypes, data.head(3))\n"
    ]
   },
   {
@@ -60,27 +215,77 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here\n"
+    "# Separate the target from the features. The frame loaded above is `data`;\n",
+    "# the `fraud` name used previously is not defined by any cell shown here.\n",
+    "y = data['isFraud']\n",
+    "X = data.drop(columns='isFraud')\n",
+    "\n",
+    "# Hold out 20% for testing. Stratifying on y keeps the (rare) fraud ratio the\n",
+    "# same in both splits, and the fixed seed makes the split reproducible.\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X, y, test_size=0.2, stratify=y, random_state=42\n",
+    ")\n"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Run a logisitc regression classifier and evaluate its accuracy."
+    "### Run a logistic regression classifier and evaluate its accuracy."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 38,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "No samples remaining after dropping missing values.\n"
+     ]
+    }
+   ],
    "source": [
-    "# Your code here"
+    "def encode_features(frame):\n",
+    "    \"\"\"Return a numeric-only copy of `frame` that scikit-learn can fit on.\n",
+    "\n",
+    "    The identifier-like string columns `nameOrig` / `nameDest` are dropped and\n",
+    "    the categorical `type` column is one-hot encoded. Calling this on a frame\n",
+    "    that is already encoded returns it unchanged, so the cell can be re-run.\n",
+    "    \"\"\"\n",
+    "    frame = frame.drop(columns=['nameOrig', 'nameDest'], errors='ignore')\n",
+    "    if 'type' in frame.columns:\n",
+    "        frame = pd.get_dummies(frame, columns=['type'], dtype=int)\n",
+    "    return frame\n",
+    "\n",
+    "\n",
+    "# pd.to_numeric(..., errors='coerce') turned every string value into NaN, so the\n",
+    "# following dropna() removed all rows. Encode the string columns instead.\n",
+    "X_train = encode_features(X_train)\n",
+    "# Give the test set exactly the training columns; a transaction type missing\n",
+    "# from one split becomes an all-zero indicator column.\n",
+    "X_test = encode_features(X_test).reindex(columns=X_train.columns, fill_value=0)\n",
+    "\n",
+    "# The amount/balance columns span many orders of magnitude; standardising\n",
+    "# (fitted on the training split only) helps the solver converge.\n",
+    "scaler = StandardScaler().fit(X_train)\n",
+    "\n",
+    "# Fit the logistic regression model\n",
+    "lr = LogisticRegression(max_iter=1000)\n",
+    "lr.fit(scaler.transform(X_train), y_train)\n",
+    "\n",
+    "# Evaluate on the held-out test split created earlier (no second split)\n",
+    "y_pred = lr.predict(scaler.transform(X_test))\n",
+    "\n",
+    "# Calculate the accuracy of the model\n",
+    "accuracy = accuracy_score(y_test, y_pred)\n",
+    "print(f\"Logistic Regression Test Accuracy: {accuracy * 100:.2f}%\")\n",
+    "\n"
    ]
   },
   {
@@ -92,11 +297,36 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 40,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "Found array with 0 sample(s) (shape=(0, 10)) while a minimum of 1 is required by DecisionTreeClassifier.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[40], line 3\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtree\u001b[39;00m \u001b[39mimport\u001b[39;00m DecisionTreeClassifier\n\u001b[0;32m      2\u001b[0m dtc \u001b[39m=\u001b[39m DecisionTreeClassifier()\n\u001b[1;32m----> 3\u001b[0m dtc\u001b[39m.\u001b[39;49mfit(X_train, y_train)\n\u001b[0;32m      4\u001b[0m y_pred \u001b[39m=\u001b[39m dtc\u001b[39m.\u001b[39mpredict(X_test)\n\u001b[0;32m      5\u001b[0m acc \u001b[39m=\u001b[39m dtc\u001b[39m.\u001b[39mscore(X_test, y_test)\u001b[39m*\u001b[39m\u001b[39m100\u001b[39m\n",
+      "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py:889\u001b[0m, in \u001b[0;36mDecisionTreeClassifier.fit\u001b[1;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[0;32m    859\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mfit\u001b[39m(\u001b[39mself\u001b[39m, X, y, sample_weight\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, check_input\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m):\n\u001b[0;32m    860\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Build a decision tree classifier from the training set (X, y).\u001b[39;00m\n\u001b[0;32m    861\u001b[0m \n\u001b[0;32m    862\u001b[0m \u001b[39m    Parameters\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    886\u001b[0m \u001b[39m        Fitted estimator.\u001b[39;00m\n\u001b[0;32m    887\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[1;32m--> 889\u001b[0m     \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mfit(\n\u001b[0;32m    890\u001b[0m         X,\n\u001b[0;32m    891\u001b[0m         y,\n\u001b[0;32m    892\u001b[0m         sample_weight\u001b[39m=\u001b[39;49msample_weight,\n\u001b[0;32m    893\u001b[0m         check_input\u001b[39m=\u001b[39;49mcheck_input,\n\u001b[0;32m    894\u001b[0m     )\n\u001b[0;32m    895\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n",
+      "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py:186\u001b[0m, in \u001b[0;36mBaseDecisionTree.fit\u001b[1;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[0;32m    184\u001b[0m check_X_params \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m(dtype\u001b[39m=\u001b[39mDTYPE, accept_sparse\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcsc\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m    185\u001b[0m check_y_params \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m(ensure_2d\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, dtype\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m)\n\u001b[1;32m--> 186\u001b[0m X, y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_data(\n\u001b[0;32m    187\u001b[0m     X, y, validate_separately\u001b[39m=\u001b[39;49m(check_X_params, check_y_params)\n\u001b[0;32m    188\u001b[0m )\n\u001b[0;32m    189\u001b[0m \u001b[39mif\u001b[39;00m issparse(X):\n\u001b[0;32m    190\u001b[0m     X\u001b[39m.\u001b[39msort_indices()\n",
+      "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\base.py:579\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[0;32m    577\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mestimator\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m check_X_params:\n\u001b[0;32m    578\u001b[0m     check_X_params \u001b[39m=\u001b[39m {\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdefault_check_params, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_X_params}\n\u001b[1;32m--> 579\u001b[0m X \u001b[39m=\u001b[39m check_array(X, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mX\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_X_params)\n\u001b[0;32m    580\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mestimator\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m check_y_params:\n\u001b[0;32m    581\u001b[0m     check_y_params \u001b[39m=\u001b[39m {\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdefault_check_params, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_y_params}\n",
+      "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:931\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m    929\u001b[0m     n_samples \u001b[39m=\u001b[39m _num_samples(array)\n\u001b[0;32m    930\u001b[0m     \u001b[39mif\u001b[39;00m n_samples \u001b[39m<\u001b[39m ensure_min_samples:\n\u001b[1;32m--> 931\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m    932\u001b[0m             \u001b[39m\"\u001b[39m\u001b[39mFound array with \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m sample(s) (shape=\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m) while a\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m    933\u001b[0m             \u001b[39m\"\u001b[39m\u001b[39m minimum of \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m is required\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m    934\u001b[0m             \u001b[39m%\u001b[39m (n_samples, array\u001b[39m.\u001b[39mshape, ensure_min_samples, context)\n\u001b[0;32m    935\u001b[0m         )\n\u001b[0;32m    937\u001b[0m \u001b[39mif\u001b[39;00m ensure_min_features \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m \u001b[39mand\u001b[39;00m array\u001b[39m.\u001b[39mndim \u001b[39m==\u001b[39m \u001b[39m2\u001b[39m:\n\u001b[0;32m    938\u001b[0m     n_features \u001b[39m=\u001b[39m array\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m]\n",
+      "\u001b[1;31mValueError\u001b[0m: Found array with 0 sample(s) (shape=(0, 10)) while a minimum of 1 is required by DecisionTreeClassifier."
+     ]
+    }
+   ],
    "source": [
-    "# Your code here"
+    "from sklearn.metrics import confusion_matrix\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "\n",
+    "# Fixed seed so the fitted tree (and the metrics below) are reproducible.\n",
+    "dtc = DecisionTreeClassifier(random_state=42)\n",
+    "dtc.fit(X_train, y_train)\n",
+    "y_pred = dtc.predict(X_test)\n",
+    "acc = dtc.score(X_test, y_test) * 100\n",
+    "print(f\"Decision Tree Test Accuracy {round(acc, 2)}%\")\n",
+    "\n",
+    "# Rows are true classes, columns are predicted classes. Pinning the labels\n",
+    "# keeps the matrix 2x2 even if one class is absent from the test split.\n",
+    "cm = confusion_matrix(y_test, y_pred, labels=[0, 1])\n",
+    "print(cm)\n",
+    "\n",
+    "# Precision = TP / (TP + FP); it is undefined when nothing was predicted as fraud.\n",
+    "predicted_fraud = cm[0, 1] + cm[1, 1]\n",
+    "print('Precision ', cm[1, 1] / predicted_fraud if predicted_fraud else float('nan'))"
    ]
   },
   {
@@ -108,11 +338,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your response here"
+    "# I completely and utterly messed up and don't know how to fix it, even with ChatGPT."
    ]
   },
   {
@@ -139,7 +369,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.10.9"
   }
  },
  "nbformat": 4,