# -------------------------------------
# Project 5 - Statistical Analysis
# V 6
# -------------------------------------

############## Libraries ##############
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import linregress, skew

from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Statistical modeling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import sklearn.metrics as metrics


############## Dataset ##############
data = pd.read_csv('/Users/patriciac/Documents/IRON/04_Proyectos/House pricing/train - train.csv')
data.head()
data.info()
data.shape
data.dtypes

############## Data cleaning ##############
missing_val = data.isnull().sum()
(missing_val[missing_val > 0]/data.shape[0]*100).sort_values(ascending=False)

## Removed features
# Because 'Alley', 'Fence' and 'MiscFeature' have more than 80% null values and, compared to the other
# features, are not very relevant to the price of a house, they were dropped.
data = data.drop(['Alley', 'Fence', 'MiscFeature'], axis=1)

## Fill NA
# The NaN values in several columns actually mean the absence of that feature; for example, if there is
# no garage, there is no garage type. Where the feature is absent, the null values in its related
# characteristics were replaced by '0'.

# In 'PoolQC', all NA values, which mean 'no pool', were replaced by '0'
data['PoolQC'] = data['PoolQC'].fillna(0)

# Additionally, 'FireplaceQu' (fireplace quality) has more than 47% null values. After comparing the NaN
# values in 'FireplaceQu' with the values in 'Fireplaces', it was decided to fill the NaN with '0'.
data[data['FireplaceQu'].isnull()][['Fireplaces','FireplaceQu']].head()
data['FireplaceQu'] = data['FireplaceQu'].fillna(0)

# The features related to garage were filled with '0' when there is no garage.
data['GarageType'] = data['GarageType'].fillna(0)
data['GarageYrBlt'] = data['GarageYrBlt'].fillna(0)
data['GarageFinish'] = data['GarageFinish'].fillna(0)
data['GarageQual'] = data['GarageQual'].fillna(0)
data['GarageCond'] = data['GarageCond'].fillna(0)

# Where there is no basement, the basement-related features were replaced by '0'
data['BsmtQual'] = data['BsmtQual'].fillna(0)
data['BsmtCond'] = data['BsmtCond'].fillna(0)
data['BsmtExposure'] = data['BsmtExposure'].fillna(0)
data['BsmtFinType1'] = data['BsmtFinType1'].fillna(0)
data['BsmtFinType2'] = data['BsmtFinType2'].fillna(0)

# The electrical system has only 0.6% null values, which were also replaced by '0'
data['Electrical'] = data['Electrical'].fillna(0)

# For the feature 'MasVnrType' (masonry veneer type), all NaN rows were replaced by '0'
data['MasVnrType'] = data['MasVnrType'].fillna(0)
# The masonry veneer area is also '0' where this feature does not exist
data['MasVnrArea'] = data['MasVnrArea'].fillna(0)

# NaN values in the feature 'LotFrontage' were also filled with '0'
data['LotFrontage'] = data['LotFrontage'].fillna(0)

## New variables
# To reduce the number of variables, all full baths and half baths were summed into a single column
data['Total_baths'] = data.BsmtFullBath + data.BsmtHalfBath + data.FullBath + data.HalfBath
data.drop(['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'], axis= 1, inplace= True)
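# Note: a common alternative (not used here) weights half baths at 0.5, e.g.
# data['Total_baths'] = data.BsmtFullBath + data.FullBath + 0.5 * (data.BsmtHalfBath + data.HalfBath)
# (only possible before the component columns are dropped above; shown as a sketch only)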

# 'GarageCars' and 'GarageArea' both measure garage size in different units, so 'GarageCars' was dropped
data.drop(['GarageCars'], axis= 1, inplace= True)

# 'YearBuilt' was replaced by 'Age', the number of years from construction to 2022
data['Age'] = 2022 - data.YearBuilt
data.drop(['YearBuilt'], axis = 1, inplace = True)

# The 'Id' variable does not provide much information, so it was dropped.
data.drop(['Id'], axis= 1, inplace= True)

# Data was then classified by type, in order to use each group in different models according to its category
numerical_data = data._get_numeric_data()
numeric_cols = list(numerical_data.columns)

categorical_data = data.drop(numeric_cols, axis=1)
categorical_cols = list(categorical_data.columns)
# The categorical features are transformed into numerical dummy variables later, in the
# linear regression section, so the raw columns stay available for feature selection

data_clean = data




############## Exploratory Analysis ##############
## Sale Price distribution
fig,axes = plt.subplots(figsize = (16,8))
ax = sns.histplot(data.SalePrice, kde=True, element="step", fill=False, color="cornflowerblue", line_kws={"lw": 3})  # sns.distplot is deprecated in recent seaborn
plt.show()
data.describe()
# Observation:
# The Sale Price distribution is positively skewed: most houses sell at relatively low prices, with a long right tail of expensive houses.

# Transformation of sale price
data['SalePrice_log'] = np.log(data['SalePrice'])
data.drop(['SalePrice'], axis = 1, inplace= True)
fig,axes = plt.subplots(figsize = (16,8))
ax = sns.histplot(data.SalePrice_log, kde=True, element="step", fill=False, color="cornflowerblue", line_kws={"lw": 3})  # sns.distplot is deprecated in recent seaborn
plt.show()
data.describe()
# Observation:
# After the log transformation, the sale price distribution is approximately normal.
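# To quantify the improvement, the skewness of the transformed variable can be checked;
# a value near 0 indicates a near-symmetric distribution (scipy.stats.skew, imported above):
print(f"Skewness of SalePrice_log: {skew(data['SalePrice_log']):.3f}")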

## Correlation
# Heatmap
plt.figure(figsize=(16,8))
# Create a mask to only show the bottom triangle
mask = np.triu(np.ones_like(data.corr(numeric_only=True), dtype=bool))
sns.heatmap(data.corr(numeric_only=True), annot=False, mask=mask, vmin=-1, vmax=1)
plt.title('Predictors correlation coefficient')
plt.show()

cor_data = data.corr(numeric_only=True)["SalePrice_log"].abs().sort_values(ascending=False).reset_index()
cor_05 = cor_data[cor_data.SalePrice_log > 0.5]

print('The variables most strongly correlated with SalePrice are:', cor_05.loc[1:, ])

# Based on the correlation matrix, 'OverallQual', 'GrLivArea', 'GarageArea', 'TotalBsmtSF', 'Total_baths',
# '1stFlrSF', 'TotRmsAbvGrd', 'Age' and 'YearRemodAdd' will be the features for the linear regression model.
# How do these features vary with SalePrice?
plt.figure(figsize=(25, 10))
features = ['OverallQual', 'GrLivArea', 'GarageArea', 'TotalBsmtSF', 'Total_baths', '1stFlrSF', 'TotRmsAbvGrd', 'Age', 'YearRemodAdd']
target = data['SalePrice_log']  # 'SalePrice' was replaced by its log transform above
for i, col in enumerate(features):
    plt.subplot(1, len(features), i+1)
    x = data[col]
    y = target
    plt.scatter(x, y, marker='o')
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('Sale Price (log)')
plt.show()
# Observations:
# Prices increase as 'OverallQual', 'GrLivArea', 'GarageArea', 'TotalBsmtSF', 'Total_baths', '1stFlrSF' and 'TotRmsAbvGrd' increase.
# There are a few outliers, most of them in 'GrLivArea', 'GarageArea' and 'TotalBsmtSF'.
# As the Age increases, the price decreases.

## Hypothesis testing
# For each feature:
# H0: the feature has no effect on the sale price (slope = 0).
# HA: the feature does affect the sale price.

hyp_test = data.copy()

def p_value(data, alpha):
    y = data['SalePrice_log']
    p_values = []
    # test each numeric feature individually against the (log) sale price
    for column in data._get_numeric_data().columns.drop('SalePrice_log'):
        x = data[column]
        slope, intercept, rvalue, pvalue, stderr = linregress(x, y)
        p_values.append((column, pvalue))
    pvalue_d = pd.DataFrame(p_values, columns=['Variable', 'P_Value'])
    pvalue_d['H0'] = pvalue_d['P_Value'] <= alpha
    pvalue_d['H0'] = pvalue_d['H0'].astype('str')
    pvalue_d['H0'] = pvalue_d['H0'].replace({'True': 'Statistically significant', 'False': 'Statistically not significant'})

    return pvalue_d

# Features statistically significant
variables_df = p_value(hyp_test, 0.05)
variables_pval = variables_df[(variables_df.H0 == 'Statistically significant')]
var_pval_list = list(variables_pval.Variable)
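# Usage example: how many of the tested features are significant
print(f'{len(var_pval_list)} of {variables_df.shape[0]} features are statistically significant at alpha = 0.05')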


### Linear regression
## In order to perform a good linear regression there are two key assumptions:
# Multivariate normality: multiple regression assumes that the residuals are normally distributed
# (a residual check is sketched after the OLS fit at the end of this script).
# No multicollinearity: multiple regression assumes that the independent variables are not highly
# correlated with each other. This assumption is tested using Variance Inflation Factor (VIF) values.

## VIF
## Computing the VIF (Variance Inflation Factor), which gauges how much a feature's inclusion
## inflates the variance of the other features' coefficients in the model
# If VIF = 1, there is no multicollinearity
# If 1 < VIF < 5, multicollinearity is moderate and generally acceptable
# If VIF >= 5, multicollinearity is high
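# For reference, VIF_i = 1 / (1 - R_i^2), where R_i^2 is the R^2 of regressing feature i
# on all the other features. A minimal sketch of that definition for a single numeric
# feature (illustration only; the statsmodels helper below is what this script uses):
def vif_single(df, feature):
    others = df.drop(columns=[feature])
    r2 = LinearRegression().fit(others, df[feature]).score(others, df[feature])
    return 1.0 / (1.0 - r2)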

# Compute the VIF for all numeric features:
def compute_vif(considered_features):
    X = data[considered_features].copy()
    # the calculation of variance inflation requires a constant
    X['intercept'] = 1
    # create a dataframe to store the VIF values
    vif = pd.DataFrame()
    vif["Variable"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif = vif[vif['Variable'] != 'intercept']
    return vif

# Applying the compute_vif function to the numeric features (excluding the target)
considered_features = [c for c in data._get_numeric_data().columns if c != 'SalePrice_log']
compute_vif(considered_features).sort_values('VIF', ascending=False)

# There are 9 features with a result of "inf", which means there is perfect collinearity.
# These represent completely redundant variables.
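# Why 'inf': if a feature is an exact linear combination of the others, R_i^2 = 1 and
# VIF = 1 / (1 - 1) diverges. A toy demonstration on synthetic data (not the housing set):
rng = np.random.default_rng(0)
toy_vif = pd.DataFrame({'a': np.arange(10.0), 'b': rng.random(10)})
toy_vif['c'] = toy_vif['a'] + toy_vif['b']  # 'c' is perfectly collinear with 'a' and 'b'
toy_vif['intercept'] = 1.0
print([variance_inflation_factor(toy_vif.values, i) for i in range(3)])  # very large or inf values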

# Looking at the description of the variables, these variables were transformed:
# 'BsmtFinSF1', 'BsmtFinSF2' and 'BsmtUnfSF' add up to 'TotalBsmtSF', so those three variables were removed
data.drop(['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF'], axis=1, inplace=True)
# '1stFlrSF' and '2ndFlrSF' (together with 'LowQualFinSF') add up to 'GrLivArea'.
# Therefore, '1stFlrSF' and '2ndFlrSF' were removed
data.drop(['1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)

# VIF computation without the features evaluated as 'inf'
considered_features = [c for c in considered_features if c in data.columns]
compute_vif(considered_features).sort_values('VIF', ascending=False)

# Following the rules above, the feature with the highest remaining VIF was removed
# Highest VIF: GrLivArea = 5.265502
data_2 = data.drop(['GrLivArea'], axis=1)

### Linear regression
variables_lm = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
'OverallQual', 'OverallCond', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
'BsmtFinType1', 'BsmtFinType2', 'TotalBsmtSF', 'Heating', 'HeatingQC',
'CentralAir', 'Electrical', 'LowQualFinSF', 'BedroomAbvGr',
'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
'GarageFinish', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
'ScreenPorch', 'PoolArea', 'PoolQC', 'MiscVal', 'MoSold', 'YrSold',
'SaleType', 'SaleCondition', 'Total_baths', 'Age']


data_lm = data_2[variables_lm]  # data_2 excludes 'GrLivArea', as decided in the VIF step

# Check which features are not numeric
numerical_data = data_lm._get_numeric_data()
numeric_cols = list(numerical_data.columns)

categorical_data = data_lm.drop(numeric_cols, axis=1)
categorical_cols = list(categorical_data.columns)

# Encode the categorical features as dummy variables for the regression
data_lm = pd.get_dummies(data=data_lm, columns=categorical_cols, drop_first=True)
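# How drop_first works, on a toy frame (illustration only): the first category level
# becomes the implicit baseline, which avoids perfectly collinear dummy columns.
toy_cat = pd.DataFrame({'Street': ['Pave', 'Grvl', 'Pave']})
print(pd.get_dummies(toy_cat, drop_first=True))  # single column 'Street_Pave'; 'Grvl' is the baseline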

## Linear regression
# Split the dataset into random train and test subsets
# X = general information about the houses
# y = sale price (log)
y = data['SalePrice_log']

X_train, X_test, y_train, y_test = train_test_split(data_lm, y, random_state=10, train_size=0.8)  # test_size = 0.2

## Linear Regression model using training data to fit and test data to predict
linreg = LinearRegression()
linreg.fit(X_train, y_train)
linreg.coef_

data_train = pd.DataFrame(linreg.coef_, index=data_lm.columns, columns=['coefficient'])
linreg.intercept_

# R2: percentage of the variance in the dependent variable that the independent variables explain collectively
linreg.score(X_train, y_train)
linreg.score(X_test, y_test)
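# The R^2 reported by .score() can be reproduced by hand as 1 - SS_res / SS_tot;
# a minimal check on the test split:
resid = y_test - linreg.predict(X_test)
ss_res = np.sum(resid ** 2)
ss_tot = np.sum((y_test - y_test.mean()) ** 2)
print(f'Manual R^2 on test data: {1 - ss_res / ss_tot:.4f}')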

predicted = linreg.predict(X_test)

df_compared = pd.DataFrame({'Real': y_test, 'Prediction': predicted})
df_compared

print(f'Mean Absolute Error: {metrics.mean_absolute_error(y_test, predicted)}')
print(f'Mean Squared Error: {metrics.mean_squared_error(y_test, predicted)}')
print(f'Root Mean Squared Error: {np.sqrt(metrics.mean_squared_error(y_test, predicted))}')
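# The errors above are on the log scale, because the target is log(SalePrice).
# One way to report an error in the original price units is to exponentiate the
# predictions and targets first (a sketch; other back-transformations exist):
rmse_price = np.sqrt(metrics.mean_squared_error(np.exp(y_test), np.exp(predicted)))
print(f'Root Mean Squared Error in original price units: {rmse_price:,.0f}')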

## OLS model using the training data to fit and predict
X = sm.add_constant(data_lm)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22, test_size=0.2)
model = sm.OLS(y_train, X_train)
results = model.fit()
results.summary()
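# Checking the multivariate-normality assumption stated in the linear regression
# section: the residuals of the fitted OLS model should be roughly normal.
# A minimal sketch using a Q-Q plot (statsmodels is already imported as sm):
fig, ax_qq = plt.subplots(figsize=(8, 6))
sm.qqplot(results.resid, line='45', fit=True, ax=ax_qq)
plt.title('Q-Q plot of OLS residuals')
plt.show()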

# Observation:
# After the transformation of the data and the VIF test, the model improved: it now explains
# around 95% of the variance in the (log) sale price (R^2 ~ 0.95).
# Because their p-values are below 0.05, the features analyzed can be considered statistically significant.
# However, the model could still be improved if more cleaning and feature engineering were performed.