ironhack-labs · DavidMartins92 · Nov 13, 2023
diff --git a/LAB customer analysis Round 5 DAVID MARTINS.ipynb b/LAB customer analysis Round 5 DAVID MARTINS.ipynb
@@ -0,0 +1,371 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "cd198984",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "SyntaxError",
+     "evalue": "invalid character '´' (U+00B4) (917027426.py, line 1)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;36m  Cell \u001b[1;32mIn[47], line 1\u001b[1;36m\u001b[0m\n\u001b[1;33m    ´# These are the normal libraries\u001b[0m\n\u001b[1;37m    ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid character '´' (U+00B4)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# These are the normal libraries\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# This is just so that we don't get annoying warnings\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "# This is the most common viz library in python\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "\n",
+    "# This one is the above on steroids\n",
+    "import seaborn as sns\n",
+    "\n",
+    "from sklearn import linear_model\n",
+    "from sklearn.metrics import mean_squared_error, r2_score\n",
+    "\n",
+    "# These Libs are for stats\n",
+    "import statsmodels.api as sm\n",
+    "from statsmodels.formula.api import ols\n",
+    "\n",
+    "customer = pd.read_csv(r'C:\\Users\\david\\OneDrive\\Ambiente de Trabalho\\Iron Hack\\ficheiros para LAB PANDA 2\\marketing_customer_analysis.csv')\n",
+    "customer\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "cab4f9df",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "unnamed:_0                         int64\n",
+       "customer                          object\n",
+       "state                             object\n",
+       "customer_lifetime_value          float64\n",
+       "response                          object\n",
+       "coverage                          object\n",
+       "education                         object\n",
+       "effective_to_date                 object\n",
+       "employmentstatus                  object\n",
+       "gender                            object\n",
+       "income                             int64\n",
+       "location_code                     object\n",
+       "marital_status                    object\n",
+       "monthly_premium_auto               int64\n",
+       "months_since_last_claim          float64\n",
+       "months_since_policy_inception      int64\n",
+       "number_of_open_complaints        float64\n",
+       "number_of_policies                 int64\n",
+       "policy_type                       object\n",
+       "policy                            object\n",
+       "renew_offer_type                  object\n",
+       "sales_channel                     object\n",
+       "total_claim_amount               float64\n",
+       "vehicle_class                     object\n",
+       "vehicle_size                      object\n",
+       "vehicle_type                      object\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "customer.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "7bbae7ea",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "unnamed:_0                         int64\n",
+       "customer                          object\n",
+       "state                             object\n",
+       "customer_lifetime_value          float64\n",
+       "response                          object\n",
+       "coverage                          object\n",
+       "education                         object\n",
+       "effective_to_date                 object\n",
+       "employmentstatus                  object\n",
+       "gender                            object\n",
+       "income                             int64\n",
+       "location_code                     object\n",
+       "marital_status                    object\n",
+       "monthly_premium_auto               int64\n",
+       "months_since_last_claim          float64\n",
+       "months_since_policy_inception      int64\n",
+       "number_of_open_complaints        float64\n",
+       "number_of_policies                 int64\n",
+       "policy_type                       object\n",
+       "policy                            object\n",
+       "renew_offer_type                  object\n",
+       "sales_channel                     object\n",
+       "total_claim_amount               float64\n",
+       "vehicle_class                     object\n",
+       "vehicle_size                      object\n",
+       "vehicle_type                      object\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "# STANDARDIZING\n",
+    "\n",
+    "cols = []\n",
+    "for i in range(len(customer.columns)): \n",
+    "    cols.append(customer.columns[i].lower().replace(' ', '_')) \n",
+    "customer.columns = cols\n",
+    "\n",
+    "customer\n",
+    "\n",
+    "customer.dtypes\n",
+    "\n",
+    "\n",
+    "    \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "1b6785a0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Normalize (numerical).\n",
+    "\n",
+    "\n",
+    "def normalize_dataframe(customer, columns_to_normalize=None):\n",
+    "    if columns_to_normalize is None:\n",
+    "        \n",
+    "        numerical_columns = customer.select_dtypes(include=['int64', 'float64']).columns\n",
+    "        columns_to_normalize = list(numerical_columns)\n",
+    "    \n",
+    "    customer_normalized = customer.copy()\n",
+    "    \n",
+    "    scaler = MinMaxScaler()\n",
+    "    \n",
+    "    customer_normalized[columns_to_normalize] = scaler.fit_transform(customer_normalized[columns_to_normalize])\n",
+    "    \n",
+    "    return customer_normalized\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "8e64add6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table class=\"simpletable\">\n",
+       "<caption>OLS Regression Results</caption>\n",
+       "<tr>\n",
+       "  <th>Dep. Variable:</th>    <td>total_claim_amount</td> <th>  R-squared (uncentered):</th>      <td>   0.510</td> \n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>Model:</th>                    <td>OLS</td>        <th>  Adj. R-squared (uncentered):</th> <td>   0.510</td> \n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>Method:</th>              <td>Least Squares</td>   <th>  F-statistic:       </th>          <td>1.136e+04</td>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>Date:</th>              <td>Mon, 16 Oct 2023</td>  <th>  Prob (F-statistic):</th>           <td>  0.00</td>  \n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>Time:</th>                  <td>20:58:47</td>      <th>  Log-Likelihood:    </th>          <td> -79898.</td> \n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>No. Observations:</th>       <td> 10910</td>       <th>  AIC:               </th>          <td>1.598e+05</td>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>Df Residuals:</th>           <td> 10909</td>       <th>  BIC:               </th>          <td>1.598e+05</td>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>Df Model:</th>               <td>     1</td>       <th>                     </th>              <td> </td>    \n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>Covariance Type:</th>       <td>nonrobust</td>     <th>                     </th>              <td> </td>    \n",
+       "</tr>\n",
+       "</table>\n",
+       "<table class=\"simpletable\">\n",
+       "<tr>\n",
+       "             <td></td>                <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>customer_lifetime_value</th> <td>    0.0354</td> <td>    0.000</td> <td>  106.598</td> <td> 0.000</td> <td>    0.035</td> <td>    0.036</td>\n",
+       "</tr>\n",
+       "</table>\n",
+       "<table class=\"simpletable\">\n",
+       "<tr>\n",
+       "  <th>Omnibus:</th>       <td>977.749</td> <th>  Durbin-Watson:     </th> <td>   1.628</td>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>Prob(Omnibus):</th> <td> 0.000</td>  <th>  Jarque-Bera (JB):  </th> <td>6781.308</td>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>Skew:</th>          <td>-0.053</td>  <th>  Prob(JB):          </th> <td>    0.00</td>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "  <th>Kurtosis:</th>      <td> 6.861</td>  <th>  Cond. No.          </th> <td>    1.00</td>\n",
+       "</tr>\n",
+       "</table><br/><br/>Notes:<br/>[1] R² is computed without centering (uncentered) since the model does not contain a constant.<br/>[2] Standard Errors assume that the covariance matrix of the errors is correctly specified."
+      ],
+      "text/latex": [
+       "\\begin{center}\n",
+       "\\begin{tabular}{lclc}\n",
+       "\\toprule\n",
+       "\\textbf{Dep. Variable:}            & total\\_claim\\_amount & \\textbf{  R-squared (uncentered):}      &     0.510   \\\\\n",
+       "\\textbf{Model:}                    &         OLS          & \\textbf{  Adj. R-squared (uncentered):} &     0.510   \\\\\n",
+       "\\textbf{Method:}                   &    Least Squares     & \\textbf{  F-statistic:       }          & 1.136e+04   \\\\\n",
+       "\\textbf{Date:}                     &   Mon, 16 Oct 2023   & \\textbf{  Prob (F-statistic):}          &     0.00    \\\\\n",
+       "\\textbf{Time:}                     &       20:58:47       & \\textbf{  Log-Likelihood:    }          &   -79898.   \\\\\n",
+       "\\textbf{No. Observations:}         &         10910        & \\textbf{  AIC:               }          & 1.598e+05   \\\\\n",
+       "\\textbf{Df Residuals:}             &         10909        & \\textbf{  BIC:               }          & 1.598e+05   \\\\\n",
+       "\\textbf{Df Model:}                 &             1        & \\textbf{                     }          &             \\\\\n",
+       "\\textbf{Covariance Type:}          &      nonrobust       & \\textbf{                     }          &             \\\\\n",
+       "\\bottomrule\n",
+       "\\end{tabular}\n",
+       "\\begin{tabular}{lcccccc}\n",
+       "                                   & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]}  \\\\\n",
+       "\\midrule\n",
+       "\\textbf{customer\\_lifetime\\_value} &       0.0354  &        0.000     &   106.598  &         0.000        &        0.035    &        0.036     \\\\\n",
+       "\\bottomrule\n",
+       "\\end{tabular}\n",
+       "\\begin{tabular}{lclc}\n",
+       "\\textbf{Omnibus:}       & 977.749 & \\textbf{  Durbin-Watson:     } &    1.628  \\\\\n",
+       "\\textbf{Prob(Omnibus):} &   0.000 & \\textbf{  Jarque-Bera (JB):  } & 6781.308  \\\\\n",
+       "\\textbf{Skew:}          &  -0.053 & \\textbf{  Prob(JB):          } &     0.00  \\\\\n",
+       "\\textbf{Kurtosis:}      &   6.861 & \\textbf{  Cond. No.          } &     1.00  \\\\\n",
+       "\\bottomrule\n",
+       "\\end{tabular}\n",
+       "%\\caption{OLS Regression Results}\n",
+       "\\end{center}\n",
+       "\n",
+       "Notes: \\newline\n",
+       " [1] R² is computed without centering (uncentered) since the model does not contain a constant. \\newline\n",
+       " [2] Standard Errors assume that the covariance matrix of the errors is correctly specified."
+      ],
+      "text/plain": [
+       "<class 'statsmodels.iolib.summary.Summary'>\n",
+       "\"\"\"\n",
+       "                                 OLS Regression Results                                \n",
+       "=======================================================================================\n",
+       "Dep. Variable:     total_claim_amount   R-squared (uncentered):                   0.510\n",
+       "Model:                            OLS   Adj. R-squared (uncentered):              0.510\n",
+       "Method:                 Least Squares   F-statistic:                          1.136e+04\n",
+       "Date:                Mon, 16 Oct 2023   Prob (F-statistic):                        0.00\n",
+       "Time:                        20:58:47   Log-Likelihood:                         -79898.\n",
+       "No. Observations:               10910   AIC:                                  1.598e+05\n",
+       "Df Residuals:                   10909   BIC:                                  1.598e+05\n",
+       "Df Model:                           1                                                  \n",
+       "Covariance Type:            nonrobust                                                  \n",
+       "===========================================================================================\n",
+       "                              coef    std err          t      P>|t|      [0.025      0.975]\n",
+       "-------------------------------------------------------------------------------------------\n",
+       "customer_lifetime_value     0.0354      0.000    106.598      0.000       0.035       0.036\n",
+       "==============================================================================\n",
+       "Omnibus:                      977.749   Durbin-Watson:                   1.628\n",
+       "Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6781.308\n",
+       "Skew:                          -0.053   Prob(JB):                         0.00\n",
+       "Kurtosis:                       6.861   Cond. No.                         1.00\n",
+       "==============================================================================\n",
+       "\n",
+       "Notes:\n",
+       "[1] R² is computed without centering (uncentered) since the model does not contain a constant.\n",
+       "[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
+       "\"\"\""
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#  X-y split.\n",
+    "\n",
+    "Y = customer['total_claim_amount']\n",
+    "X = customer['customer_lifetime_value']\n",
+    "\n",
+    "# Apply linear regression to the X/Y split above (sm.OLS fits no intercept unless a constant column is added to X)\n",
+    "\n",
+    "model = sm.OLS(Y,X).fit()\n",
+    "\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "830d4a01",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4188ab5c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf896d89",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}