diff --git a/HOUSE.ipynb b/HOUSE.ipynb new file mode 100644 index 000000000..406333545 --- /dev/null +++ b/HOUSE.ipynb @@ -0,0 +1,1096 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1087, + "status": "ok", + "timestamp": 1565544712205, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "M-vp2gfDu-kp", + "outputId": "7ceab647-acd7-415b-f657-0033c140f6af" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 3877, + "status": "ok", + "timestamp": 1565544768309, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "5hkcGKXQvLPn", + "outputId": "9198c5b1-0cad-4876-d1e9-429d0e9f6232" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data_description.txt house.ipynb sample_submission.csv test.csv train.csv\n" + ] + } + ], + "source": [ + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "KYyLDKmPygST" + }, + "outputs": [], + "source": [ + "import numpy as np \n", + "import pandas as pd\n", + "#import matplotlib.pyplot as plt\n", + "#import seaborn as sns\n", + "\n", + "import os\n", + "for dirname, _, filenames in os.walk('/kaggle/input'):\n", + " for filename in filenames:\n", + " print(os.path.join(dirname, filename))\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "A8OJkSDQygO-" + }, + "outputs": [], + "source": [ + "train=pd.read_csv(\"train.csv\")\n", + "train.head()\n", + "test=pd.read_csv(\"test.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1059, + "status": "ok", + "timestamp": 1565546184243, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "yGAT1pwcygLv", + "outputId": "f811109b-ac53-45f2-a4d2-1c30c3a9b0db" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "th train data has 1460 rows and 81 features\n", + "the test data has 1459 rows and 80 features\n" + ] + } + ], + "source": [ + "print('th train data has {} rows and {} features'.format(train.shape[0],train.shape[1]))\n", + "print('the test data has {} rows and {} features'.format(test.shape[0],test.shape[1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1046, + "status": "ok", + "timestamp": 1565546188319, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "sKP9JiP6ygIR", + "outputId": "65152c8f-56f4-46d2-8a3a-3dbe3741ab36" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tha data has 2919 rows and 80 features\n" + ] + } + ], + "source": [ + "data=pd.concat([train.iloc[:,:-1],test],axis=0)\n", + "print('tha data has {} rows and {} features'.format(data.shape[0],data.shape[1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 323 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1062, + "status": "ok", + "timestamp": 1565546190763, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "dofqtiXxygEL", + "outputId": "5ab208ef-66e7-4f70-a7cc-4123667110f0" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',\n", + " 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',\n", + " 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',\n", + " 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',\n", + " 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',\n", + " 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',\n", + " 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',\n", + " 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',\n", + " 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',\n", + " 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',\n", + " 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',\n", + " 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',\n", + " 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',\n", + " 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',\n", + " 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',\n", + " 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',\n", + " 'SaleCondition'],\n", + " dtype='object')" + ] + }, + "execution_count": 10, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1041, + "status": "ok", + "timestamp": 1565546194423, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "08YBWNjryf_v", + "outputId": "23b8cc17-9ae8-4ef9-ba73-9da91e1e49c4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 2919 entries, 0 to 1458\n", + "Data columns (total 80 columns):\n", + "Id 2919 non-null int64\n", + "MSSubClass 2919 non-null int64\n", + "MSZoning 2915 non-null object\n", + "LotFrontage 2433 non-null float64\n", + "LotArea 2919 non-null int64\n", + "Street 2919 non-null object\n", + "Alley 198 non-null object\n", + "LotShape 2919 non-null object\n", + "LandContour 2919 non-null object\n", + "Utilities 2917 non-null object\n", + "LotConfig 2919 non-null object\n", + "LandSlope 2919 non-null object\n", + "Neighborhood 2919 non-null object\n", + "Condition1 2919 non-null object\n", + "Condition2 2919 non-null object\n", + "BldgType 2919 non-null object\n", + "HouseStyle 2919 non-null object\n", + "OverallQual 2919 non-null int64\n", + "OverallCond 2919 non-null int64\n", + "YearBuilt 2919 non-null int64\n", + "YearRemodAdd 2919 non-null int64\n", + "RoofStyle 2919 non-null object\n", + "RoofMatl 2919 non-null object\n", + "Exterior1st 2918 non-null object\n", + "Exterior2nd 2918 non-null object\n", + "MasVnrType 2895 non-null object\n", + "MasVnrArea 2896 non-null float64\n", + "ExterQual 2919 non-null object\n", + "ExterCond 2919 non-null object\n", + "Foundation 2919 non-null object\n", + "BsmtQual 2838 non-null object\n", + "BsmtCond 2837 non-null object\n", + "BsmtExposure 2837 non-null object\n", + "BsmtFinType1 2840 non-null object\n", + "BsmtFinSF1 2918 non-null float64\n", + "BsmtFinType2 2839 non-null object\n", + "BsmtFinSF2 2918 non-null float64\n", + "BsmtUnfSF 2918 non-null float64\n", + "TotalBsmtSF 2918 non-null float64\n", + "Heating 2919 non-null object\n", + "HeatingQC 2919 non-null object\n", + "CentralAir 2919 non-null object\n", + "Electrical 2918 non-null object\n", + "1stFlrSF 2919 non-null int64\n", + "2ndFlrSF 2919 non-null int64\n", + "LowQualFinSF 2919 non-null int64\n", + "GrLivArea 2919 non-null int64\n", + "BsmtFullBath 2917 non-null float64\n", + "BsmtHalfBath 2917 non-null float64\n", + "FullBath 2919 non-null int64\n", + "HalfBath 2919 non-null int64\n", + "BedroomAbvGr 2919 non-null int64\n", + "KitchenAbvGr 2919 non-null int64\n", + "KitchenQual 2918 non-null object\n", + "TotRmsAbvGrd 2919 non-null int64\n", + "Functional 2917 non-null object\n", + "Fireplaces 2919 non-null int64\n", + "FireplaceQu 1499 non-null object\n", + "GarageType 2762 non-null object\n", + "GarageYrBlt 2760 non-null float64\n", + "GarageFinish 2760 non-null object\n", + "GarageCars 2918 non-null float64\n", + "GarageArea 2918 non-null float64\n", + "GarageQual 2760 non-null object\n", + "GarageCond 2760 non-null object\n", + "PavedDrive 2919 non-null object\n", + "WoodDeckSF 2919 non-null int64\n", + "OpenPorchSF 2919 non-null int64\n", + "EnclosedPorch 2919 non-null int64\n", + "3SsnPorch 2919 non-null int64\n", + "ScreenPorch 2919 non-null int64\n", + "PoolArea 2919 non-null int64\n", + "PoolQC 10 non-null object\n", + "Fence 571 non-null object\n", + "MiscFeature 105 non-null object\n", + "MiscVal 2919 non-null int64\n", + "MoSold 2919 non-null int64\n", + "YrSold 2919 non-null int64\n", + "SaleType 2918 non-null object\n", + "SaleCondition 2919 non-null object\n", + "dtypes: float64(11), int64(26), object(43)\n", + "memory usage: 1.8+ MB\n" + ] + } + ], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "bbOas4Dayf8E" + }, + "outputs": [], + "source": [ + "data = data.drop(columns=['Id'],axis=1)\n", + "num_features=data.select_dtypes(include=['int64','float64'])\n", + "categorical_features=data.select_dtypes(include='object')\n", + "num_features.describe()\n", + "\n", + "categorical_features.describe()\n", + "\n", + "data.isnull().sum().sort_values(ascending=False)[:34]\n", + "#print(categorical_features.isnull().sum().sort_values(ascending=False)[:23])\n", + "#num_features.isnull().sum().sort_values(ascending=False)[:11]\n", + "\n", + "f = open(\"data_description.txt\", \"r\")\n", + "#print(f.read())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "nAqBsGDEyf4b" + }, + "outputs": [], + "source": [ + "#data['LotFrontage'].fillna(int(data['LotFrontage'].mean()),inplace=True)\n", + "data['LotFrontage'] = data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))\n", + "data['LotFrontage'].isnull().sum()\n", + "\n", + "#create a new class 'other'\n", + "features=['Functional','Utilities','Electrical','KitchenQual','SaleType','Exterior2nd','Exterior1st','PoolQC','MiscFeature','Alley','Fence','FireplaceQu','GarageCond','GarageQual','GarageFinish','GarageType','BsmtCond','BsmtExposure','BsmtQual','BsmtFinType2','BsmtFinType1','MasVnrType']\n", + "for name in features:\n", + " data[name].fillna('Other',inplace=True)\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_3NIcRPzyfzq" + }, + "outputs": [], + "source": [ + "data[features].isnull().sum()\n", + "data['MSZoning'] = data.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1284, + "status": "ok", + "timestamp": 1565546206798, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "qaczacuRyfv6", + "outputId": "9ac9f315-b8bd-49bf-b0ef-b5ce24d81586" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\"mode=['MSZoning','Functional','Utilities','Electrical','KitchenQual','SaleType','Exterior2nd','Exterior1st']\\nfor name in mode:\\n data[name].fillna(data[name].mode()[0],inplace=True)\"" + ] + }, + "execution_count": 15, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "#use the mode value for the rest of the features\n", + "\"\"\"mode=['MSZoning','Functional','Utilities','Electrical','KitchenQual','SaleType','Exterior2nd','Exterior1st']\n", + "for name in mode:\n", + " data[name].fillna(data[name].mode()[0],inplace=True)\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DIKkjz80yfsQ" + }, + "outputs": [], + "source": [ + "zero=['GarageYrBlt','GarageArea','MasVnrArea','BsmtHalfBath','BsmtHalfBath','BsmtFullBath','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','GarageCars']\n", + "for name in zero:\n", + " data[name].fillna(0,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 187 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1115, + "status": "ok", + "timestamp": 1565546210895, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "NUOayYdTzfkS", + "outputId": "c5366f2d-c331-4b1c-e603-d7d410e41be0" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',\n", + " 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',\n", + " 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',\n", + " 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',\n", + " 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',\n", + " 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',\n", + " 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',\n", + " 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',\n", + " 'SaleType', 'SaleCondition'],\n", + " dtype='object')" + ] + }, + "execution_count": 17, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "data.isnull().sum().sum()\n", + "data.select_dtypes(include='object').columns" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "qRXHdirxzfht" + }, + "outputs": [], + "source": [ + "def dummies(d):\n", + " dummies_df=pd.DataFrame()\n", + " for name in d.select_dtypes(include='object').columns:\n", + " dummies = pd.get_dummies(d[name], drop_first=False)\n", + " dummies = dummies.add_prefix(\"{}_\".format(name))\n", + " dummies_df=pd.concat([dummies_df,dummies],axis=1)\n", + " return dummies_df" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "0Na_mDS7zfeh" + }, + "outputs": [], + "source": [ + "dummies_data=dummies(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 874, + "status": "ok", + "timestamp": 1565546218090, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "AlNSIVCSzfb6", + "outputId": "be7ffef4-2d46-42cc-f633-79d4671f1a3b" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(2919, 273)" + ] + }, + "execution_count": 20, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "dummies_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "2C6kkoFxztto" + }, + "outputs": [], + "source": [ + "data['MSSubClass'] = data['MSSubClass'].astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "kmeHw_vmztqe" + }, + "outputs": [], + "source": [ + "MSSubClass_dummmies=pd.get_dummies(data['MSSubClass'], drop_first=False).add_prefix(\"MSSubClass_\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "un9snRwvztnN" + }, + "outputs": [], + "source": [ + "object_features=['MSSubClass','MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',\n", + " 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',\n", + " 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',\n", + " 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',\n", + " 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',\n", + " 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',\n", + " 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',\n", + " 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',\n", + " 'SaleType', 'SaleCondition']" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "bm1VrJRwztjb" + }, + "outputs": [], + "source": [ + "data_drop=data.drop(columns=object_features,axis=1)\n", + "data_drop.shape\n", + "data_drop.columns\n", + "from scipy import stats\n", + "from scipy.stats import pointbiserialr, spearmanr, skew, pearsonr\n", + "\n", + "for col in data_drop.columns:\n", + " if skew(data_drop[col]) > 0.75:\n", + " data_drop[col] = np.log1p(data_drop[col])\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1037, + "status": "ok", + "timestamp": 1565546230598, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "1tlDQjGKzthK", + "outputId": "910d2ad0-ce20-468f-ec95-7fc2f283c062" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1460, 323)\n" + ] + }, + { + "data": { + "text/plain": [ + "(1459, 323)" + ] + }, + "execution_count": 25, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "final_data=pd.concat([MSSubClass_dummmies,data_drop,dummies_data],axis=1)\n", + "final_data.shape\n", + "#Re-spliting the data into train and test data set and ignore the Id column\n", + "train_df=final_data.iloc[:1460,1:]\n", + "test_data=final_data.iloc[1460:,1:]\n", + "print(train_df.shape)\n", + "test_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gJCWpNTGztdJ" + }, + "outputs": [], + "source": [ + "#Concatenating the train_df and the target variable from the original train data set \n", + "train_data=pd.concat([train_df,train.iloc[:,-1]],axis=1)\n", + "#train_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1038, + "status": "ok", + "timestamp": 1565546237745, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "qOxiICV00M7Z", + "outputId": "0a19608a-ba8d-4bc4-c450-1fe84a5baa87" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'from sklearn.model_selection import cross_val_score, GridSearchCV\\nfrom sklearn.ensemble import RandomForestRegressor\\n#from sklearn.preprocessing import MinMaxScaler'" + ] + }, + "execution_count": 27, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"from sklearn.model_selection import cross_val_score, GridSearchCV\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "#from sklearn.preprocessing import MinMaxScaler\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "GWF0pnZu0M5P" + }, + "outputs": [], + "source": [ + "#spliting the train_data into X: independent variables and y: target variable\n", + "X=train_data.iloc[:,:-1]\n", + "y=train_data.iloc[:,-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "qxd5yDLF0M0i" + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import Ridge, RidgeCV, LassoCV" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1051, + "status": "ok", + "timestamp": 1565546246313, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "XsLJYmIk0MxE", + "outputId": "c517e860-8888-4f55-bafd-83c9c90b715c" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'las_cv = LassoCV(alphas=(0.0001, 0.0005, 0.001, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10))\\nlas_cv.fit(X,y)\\nlas_cv_preds=las_cv.predict(test_data)'" + ] + }, + "execution_count": 30, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"las_cv = LassoCV(alphas=(0.0001, 0.0005, 0.001, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10))\n", + "las_cv.fit(X,y)\n", + "las_cv_preds=las_cv.predict(test_data)\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "yPR6Q8qD0YBw" + }, + "outputs": [], + "source": [ + "ridge_cv = RidgeCV(alphas=(0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10))\n", + "ridge_cv.fit(X, y)\n", + "ridge_cv_preds=ridge_cv.predict(test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "xAHUoN4x0X9_" + }, + "outputs": [], + "source": [ + "\"\"\"ridge = Ridge(alpha=10, solver='auto')\n", + "ridge.fit(X, y)\n", + "predictions=ridge.predict(test_data)\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 4895, + "status": "ok", + "timestamp": 1565546256060, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "5T9MW3gW0X6i", + "outputId": "d9ba6247-7ab8-4f66-8082-b28465ff48f0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[17:57:35] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n" + ] + }, + { + "data": { + "text/plain": [ + "array([117347.30588133, 162752.2398527 , 185864.73163177, 190148.26587855,\n", + " 202151.42710657])" + ] + }, + "execution_count": 32, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "import xgboost as xgb\n", + "model_xgb = xgb.XGBRegressor(n_estimators=340, max_depth=2, learning_rate=0.2)\n", + "model_xgb.fit(X, y)\n", + "xgb_preds=model_xgb.predict(test_data)\n", + "predictions = 0.5 * ridge_cv_preds + 0.5 * xgb_preds\n", + "#display the first 5 predictions of sale price\n", + "predictions[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1086, + "status": "ok", + "timestamp": 1565546260682, + "user": { + "displayName": "RJ13 PINDU", + "photoUrl": "https://lh3.googleusercontent.com/-xZi5buREJh8/AAAAAAAAAAI/AAAAAAAACVc/plX37CKDbxc/s64/photo.jpg", + "userId": "06664894662100164987" + }, + "user_tz": -330 + }, + "id": "MTKcGx9Q0X3T", + "outputId": "24ef82ba-1e58-4892-9d15-d4970148ab6d" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSalePrice
01461117347.305881
11462162752.239853
21463185864.731632
31464190148.265879
41465202151.427107
\n", + "
" + ], + "text/plain": [ + " Id SalePrice\n", + "0 1461 117347.305881\n", + "1 1462 162752.239853\n", + "2 1463 185864.731632\n", + "3 1464 190148.265879\n", + "4 1465 202151.427107" + ] + }, + "execution_count": 33, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "#make the submission data frame\n", + "submission = {\n", + " 'Id': test.Id.values,\n", + " 'SalePrice': predictions\n", + "}\n", + "solution = pd.DataFrame(submission)\n", + "solution.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "lpCpFe2x0Xze" + }, + "outputs": [], + "source": [ + "#make the submission file\n", + "solution.to_csv('submission.csv',index=False)" + ] + } + ], + "metadata": { + "colab": { + "name": "house.ipynb", + "provenance": [], + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/house_predict_report.pdf b/house_predict_report.pdf new file mode 100644 index 000000000..715ba683d Binary files /dev/null and b/house_predict_report.pdf differ