From 54b0c931a2eb35be2886f049a1f414633b8fc457 Mon Sep 17 00:00:00 2001 From: Leonardo Esteban Pagliacci <110601781+leonardo-pagliacci@users.noreply.github.com> Date: Sun, 15 Oct 2023 15:22:52 +0100 Subject: [PATCH] lab 5 solved --- ..._analysis_round5] leonardo_pagliacci.ipynb | 784 ++++++++++++++++++ 1 file changed, 784 insertions(+) create mode 100644 [lab_customer_analysis_round5] leonardo_pagliacci.ipynb diff --git a/[lab_customer_analysis_round5] leonardo_pagliacci.ipynb b/[lab_customer_analysis_round5] leonardo_pagliacci.ipynb new file mode 100644 index 0000000..ae9b5f8 --- /dev/null +++ b/[lab_customer_analysis_round5] leonardo_pagliacci.ipynb @@ -0,0 +1,784 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bc9713bb", + "metadata": {}, + "source": [ + "# Lab | Customer Analysis Round 5\n", + "\n", + "For this lab, we keep using the `marketing_customer_analysis.csv` file that you can find in the `files_for_lab` folder.\n", + "\n", + "### Get the data\n", + "\n", + "We are using the `marketing_customer_analysis.csv` file.\n", + "\n", + "### Dealing with the data\n", + "\n", + "Already done in round 2.\n", + "\n", + "### Explore the data\n", + "\n", + "Done in round 3.\n", + "\n", + "### Processing Data\n", + "\n", + "(_Further processing..._)\n", + "\n", + "- X-y split.\n", + "- Normalize (numerical).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "332f3c58", + "metadata": {}, + "outputs": [], + "source": [ + "# Importing the libraries\n", + "\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "0112f86c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgenderincome...months_since_policy_inceptionnumber_of_open_complaintsnumber_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_size
0DK49336Arizona4809.216960NoBasicCollege2/18/11EmployedM48029...520.09Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsize
1KX64629California2228.525238NoBasicCollege1/18/11UnemployedF0...260.01Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsize
2LZ68649Washington14947.917300NoBasicBachelor2/10/11EmployedM22139...310.02Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsize
3XL78013Oregon22332.439460YesExtendedCollege1/11/11EmployedM49078...30.02Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsize
4QA50777Oregon9025.067525NoPremiumBachelor1/17/11Medical LeaveF23675...31NaN7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsize
..................................................................
10905FE99816Nevada15563.369440NoPremiumBachelor1/19/11UnemployedF0...40NaN7Personal AutoPersonal L1Offer3Web1214.400000Luxury CarMedsize
10906KX53892Oregon5259.444853NoBasicCollege1/6/11EmployedF61146...680.06Personal AutoPersonal L3Offer2Branch273.018929Four-Door CarMedsize
10907TL39050Arizona23893.304100NoExtendedBachelor2/6/11EmployedF39837...630.02Corporate AutoCorporate L3Offer1Web381.306996Luxury SUVMedsize
10908WA60547California11971.977650NoPremiumCollege2/13/11EmployedF64195...274.06Personal AutoPersonal L1Offer1Branch618.288849SUVMedsize
10909IV32877NaN6857.519928NaNBasicBachelor1/8/11UnemployedM0...10.03Personal AutoPersonal L1Offer4Web1021.719397SUVMedsize
\n", + "

10910 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " customer state customer_lifetime_value response coverage \\\n", + "0 DK49336 Arizona 4809.216960 No Basic \n", + "1 KX64629 California 2228.525238 No Basic \n", + "2 LZ68649 Washington 14947.917300 No Basic \n", + "3 XL78013 Oregon 22332.439460 Yes Extended \n", + "4 QA50777 Oregon 9025.067525 No Premium \n", + "... ... ... ... ... ... \n", + "10905 FE99816 Nevada 15563.369440 No Premium \n", + "10906 KX53892 Oregon 5259.444853 No Basic \n", + "10907 TL39050 Arizona 23893.304100 No Extended \n", + "10908 WA60547 California 11971.977650 No Premium \n", + "10909 IV32877 NaN 6857.519928 NaN Basic \n", + "\n", + " education effective_to_date employmentstatus gender income ... \\\n", + "0 College 2/18/11 Employed M 48029 ... \n", + "1 College 1/18/11 Unemployed F 0 ... \n", + "2 Bachelor 2/10/11 Employed M 22139 ... \n", + "3 College 1/11/11 Employed M 49078 ... \n", + "4 Bachelor 1/17/11 Medical Leave F 23675 ... \n", + "... ... ... ... ... ... ... \n", + "10905 Bachelor 1/19/11 Unemployed F 0 ... \n", + "10906 College 1/6/11 Employed F 61146 ... \n", + "10907 Bachelor 2/6/11 Employed F 39837 ... \n", + "10908 College 2/13/11 Employed F 64195 ... \n", + "10909 Bachelor 1/8/11 Unemployed M 0 ... \n", + "\n", + " months_since_policy_inception number_of_open_complaints \\\n", + "0 52 0.0 \n", + "1 26 0.0 \n", + "2 31 0.0 \n", + "3 3 0.0 \n", + "4 31 NaN \n", + "... ... ... \n", + "10905 40 NaN \n", + "10906 68 0.0 \n", + "10907 63 0.0 \n", + "10908 27 4.0 \n", + "10909 1 0.0 \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "... ... ... ... ... 
\n", + "10905 7 Personal Auto Personal L1 Offer3 \n", + "10906 6 Personal Auto Personal L3 Offer2 \n", + "10907 2 Corporate Auto Corporate L3 Offer1 \n", + "10908 6 Personal Auto Personal L1 Offer1 \n", + "10909 3 Personal Auto Personal L1 Offer4 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "... ... ... ... ... \n", + "10905 Web 1214.400000 Luxury Car Medsize \n", + "10906 Branch 273.018929 Four-Door Car Medsize \n", + "10907 Web 381.306996 Luxury SUV Medsize \n", + "10908 Branch 618.288849 SUV Medsize \n", + "10909 Web 1021.719397 SUV Medsize \n", + "\n", + "[10910 rows x 24 columns]" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Importing the dataframe\n", + "\n", + "customer_df = pd.read_csv(\"/Users/leozinho.air/Desktop/ironhack_da/class_04/lab-customer-analysis-round-2/files_for_lab/csv_files/marketing_customer_analysis.csv\")\n", + "\n", + "# Cleaning operations\n", + "\n", + "columns = []\n", + "\n", + "for i in range(len(customer_df.columns)):\n", + " columns.append(customer_df.columns[i].lower().replace(' ', '_'))\n", + "\n", + "customer_df.columns = columns\n", + "\n", + "customer_df\n", + "\n", + "# Dropping the column 'unnamed:_0'\n", + "\n", + "customer_df = customer_df.drop(['unnamed:_0'], axis = 1)\n", + "\n", + "# The only values are Nan or vehicle class 'A', let's create a new df without the column.\n", + "\n", + "customer_df = customer_df.drop(['vehicle_type'], axis = 1)\n", + "\n", + "customer_df" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "958cdaf7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 43, + "metadata": 
{}, + "output_type": "execute_result" + } + ], + "source": [ + "# This is for predictions\n", + "from sklearn import linear_model\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "\n", + "# These libs are for stats -> these ones are for description\n", + "import statsmodels.api as sm\n", + "from statsmodels.formula.api import ols\n", + "\n", + "# Creating a dataframe with only numerical values\n", + "\n", + "num_cust = customer_df.select_dtypes(include = np.number)\n", + "\n", + "# Checking for null values\n", + "\n", + "num_cust.isnull().any() # Two columns have nan values\n", + "\n", + "# I chose to fill the NaN values with the means of the two columns\n", + "\n", + "mean_months = num_cust['months_since_last_claim'].mean()\n", + "\n", + "num_cust['months_since_last_claim'].fillna(value = mean_months, inplace = True)\n", + "\n", + "\n", + "mean_complaints = num_cust['number_of_open_complaints'].mean()\n", + "num_cust['number_of_open_complaints'].fillna(value = mean_complaints, inplace = True)\n", + "\n", + "num_cust.isnull().sum().sum() # There are no nan values\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "95c51d22", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_lifetime_valueincomemonthly_premium_automonths_since_last_claimmonths_since_policy_inceptionnumber_of_open_complaintsnumber_of_policiestotal_claim_amount
00.0357520.4803810.0000000.2000000.5252530.0000001.0000.101171
10.0040590.0000000.0126580.0857140.2626260.0000000.0000.257445
20.1602640.2214320.1645570.9714290.3131310.0000000.1250.165875
30.2509530.4908730.1518990.2857140.0303030.0000000.1250.167263
40.0875270.2367950.2362870.4328310.3131310.0768510.7500.244657
...........................
109050.1678230.0000000.8101270.4328310.4040400.0768510.7500.419717
109060.0412810.6115760.0168780.2000000.6868690.0000000.6250.094333
109070.2701220.3984460.5907170.3142860.6363640.0000000.1250.131763
109080.1237170.6420720.4092830.0000000.2727270.8000000.6250.213674
109090.0609070.0000000.1687760.8857140.0101010.0000000.2500.353118
\n", + "

10910 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 0.035752 0.480381 0.000000 \n", + "1 0.004059 0.000000 0.012658 \n", + "2 0.160264 0.221432 0.164557 \n", + "3 0.250953 0.490873 0.151899 \n", + "4 0.087527 0.236795 0.236287 \n", + "... ... ... ... \n", + "10905 0.167823 0.000000 0.810127 \n", + "10906 0.041281 0.611576 0.016878 \n", + "10907 0.270122 0.398446 0.590717 \n", + "10908 0.123717 0.642072 0.409283 \n", + "10909 0.060907 0.000000 0.168776 \n", + "\n", + " months_since_last_claim months_since_policy_inception \\\n", + "0 0.200000 0.525253 \n", + "1 0.085714 0.262626 \n", + "2 0.971429 0.313131 \n", + "3 0.285714 0.030303 \n", + "4 0.432831 0.313131 \n", + "... ... ... \n", + "10905 0.432831 0.404040 \n", + "10906 0.200000 0.686869 \n", + "10907 0.314286 0.636364 \n", + "10908 0.000000 0.272727 \n", + "10909 0.885714 0.010101 \n", + "\n", + " number_of_open_complaints number_of_policies total_claim_amount \n", + "0 0.000000 1.000 0.101171 \n", + "1 0.000000 0.000 0.257445 \n", + "2 0.000000 0.125 0.165875 \n", + "3 0.000000 0.125 0.167263 \n", + "4 0.076851 0.750 0.244657 \n", + "... ... ... ... 
\n", + "10905 0.076851 0.750 0.419717 \n", + "10906 0.000000 0.625 0.094333 \n", + "10907 0.000000 0.125 0.131763 \n", + "10908 0.800000 0.625 0.213674 \n", + "10909 0.000000 0.250 0.353118 \n", + "\n", + "[10910 rows x 8 columns]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# X-y split.\n", + "\n", + "Y = customer_df['total_claim_amount']\n", + "X = customer_df.drop(['total_claim_amount'], axis = 1)\n", + "\n", + "# Normalizing the DataFrame\n", + "\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "\n", + "scaler = MinMaxScaler() # This is the normalization process\n", + "\n", + "normalized_data = scaler.fit_transform(num_cust) # Fit it to the data\n", + "normalized_data = pd.DataFrame(normalized_data, columns = num_cust.columns)\n", + "\n", + "normalized_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ccdae9d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}