From 2f90d27ad7662f31d241123909ec45101424ca80 Mon Sep 17 00:00:00 2001 From: vkamg Date: Sun, 12 Apr 2020 23:30:55 +0200 Subject: [PATCH 1/3] working on challenge 2 --- .../your-code/main.ipynb | 147 ++++++++++++++---- 1 file changed, 117 insertions(+), 30 deletions(-) diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 0102ef94..8f1ac815 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -12,11 +12,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "import sklearn\n", + "from sklearn import datasets" ] }, { @@ -37,11 +39,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "diabetes = datasets.load_diabetes()" ] }, { @@ -53,11 +57,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.keys()" ] }, { @@ -73,13 +89,57 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_diabetes_dataset:\n", + "\n", + "Diabetes dataset\n", + "----------------\n", + "\n", + "Ten baseline variables, age, sex, body mass index, average blood\n", + "pressure, and six blood serum measurements were obtained for each of n =\n", + "442 diabetes patients, as well as the response of interest, a\n", + "quantitative measure of disease progression one year after baseline.\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 442\n", + "\n", + " :Number of Attributes: First 10 columns are numeric predictive values\n", + "\n", + " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n", + "\n", + " :Attribute Information:\n", + " - Age\n", + " - Sex\n", + " - Body mass index\n", + " - Average blood pressure\n", + " - S1\n", + " - S2\n", + " - S3\n", + " - S4\n", + " - S5\n", + " - S6\n", + "\n", + "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\n", + "\n", + "Source URL:\n", + "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n", + "\n", + "For more information see:\n", + "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n", + "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes.DESCR)" ] }, { @@ -96,12 +156,21 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "raw", "metadata": {}, - "outputs": [], "source": [ - "# Enter your answer here:\n" + "# Enter your answer here:\n", + "1. How many attributes are there in the data? What do they mean?\n", + " - There are 10 attributes: age, sex, body mass index, average blood\n", + " pressure, and six blood serum measurements (S1, S2, S3, S4, S5 and S6)\n", + "\n", + "2. 
What is the relation between diabetes['data'] and diabetes['target']?\n", + "\n", + " - The diabetes['data'] is the baseline variables and the diabetes['target'] is a quantitative measure of disease progression one year after baseline.\n", + "\n", + "3. How many records are there in the data?\n", + "\n", + " - 442 (n = 442 diabetes patients)" ] }, { @@ -115,11 +184,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The shape of data is (442, 10) and the shape of target is (442,)\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "data_shape = diabetes['data'].shape\n", + "target_shape = diabetes['target'].shape\n", + "\n", + "print(f\"The shape of data is {data_shape} and the shape of target is {target_shape}\")" ] }, { @@ -156,11 +238,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.linear_model import LinearRegression" ] }, { @@ -172,11 +255,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "diabetes_model = LinearRegression()" ] }, { @@ -194,7 +279,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "\n" ] }, { @@ -712,9 +799,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:data_env]", "language": "python", - "name": "python3" + "name": "conda-env-data_env-py" }, "language_info": { "codemirror_mode": { @@ -726,9 +813,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.7.5" } }, "nbformat": 4, - "nbformat_minor": 2 + 
"nbformat_minor": 4 } From 7e2e6ca9cb3d2a20124ad1dac79deb1ca06b8086 Mon Sep 17 00:00:00 2001 From: vkamg Date: Mon, 13 Apr 2020 01:25:26 +0200 Subject: [PATCH 2/3] working on challenge 3 --- .../your-code/main.ipynb | 365 ++++++++++++++++-- 1 file changed, 331 insertions(+), 34 deletions(-) diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 8f1ac815..8cadb183 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -275,13 +275,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Your code here:\n", "\n", - "\n" + "diabetes_data_train = diabetes['data'][0:-20]\n", + "\n", + "diabetes_target_train = diabetes['target'][0:-20]\n", + "\n", + "diabetes_data_test = diabetes['data'][-20:]\n", + "\n", + "diabetes_target_test = diabetes['target'][-20:]" ] }, { @@ -293,11 +299,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coeficients are: [ 3.03499549e-01 -2.37639315e+02 5.10530605e+02 3.27736980e+02\n", + " -8.14131709e+02 4.92814588e+02 1.02848452e+02 1.84606489e+02\n", + " 7.43519617e+02 7.60951722e+01]\n", + "The intercept is: 152.76430691633442\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "X = diabetes_data_train\n", + "y = diabetes_target_train\n", + "\n", + "diabetes_model = LinearRegression().fit(X, y)\n", + "\n", + "print(f'Coeficients are: {diabetes_model.coef_}')\n", + "print(f'The intercept is: {diabetes_model.intercept_}')" ] }, { @@ -318,11 +343,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "[197.61846908 155.43979328 172.88665147 111.53537279 164.80054784\n", + " 131.06954875 259.12237761 100.47935157 117.0601052 124.30503555\n", + " 218.36632793 61.19831284 132.25046751 120.3332925 52.54458691\n", + " 194.03798088 102.57139702 123.56604987 211.0346317 52.60335674]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "predictions = diabetes_model.predict(diabetes_data_test)\n", + "\n", + "print(predictions)" ] }, { @@ -334,11 +374,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[233. 91. 111. 152. 120. 67. 310. 94. 183. 66. 173. 72. 49. 64.\n", + " 48. 178. 104. 132. 220. 57.]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(diabetes_target_test)" ] }, { @@ -349,12 +399,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "raw", "metadata": {}, - "outputs": [], "source": [ - "# Your explanation here:\n" + "# Your explanation here:\n", + "No, it is not the same. What the linear regression algorithm does is it fits multiple lines on the data points and returns the line that results in the least error, but there is always an error." ] }, { @@ -389,11 +438,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " OLS Regression Results \n", + "=======================================================================================\n", + "Dep. Variable: y R-squared (uncentered): 0.110\n", + "Model: OLS Adj. R-squared (uncentered): 0.089\n", + "Method: Least Squares F-statistic: 5.109\n", + "Date: Mon, 13 Apr 2020 Prob (F-statistic): 4.77e-07\n", + "Time: 00:57:01 Log-Likelihood: -2745.5\n", + "No. 
Observations: 422 AIC: 5511.\n", + "Df Residuals: 412 BIC: 5552.\n", + "Df Model: 10 \n", + "Covariance Type: nonrobust \n", + "==============================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "x1 42.9190 183.983 0.233 0.816 -318.744 404.582\n", + "x2 -261.9605 188.650 -1.389 0.166 -632.798 108.877\n", + "x3 547.5378 204.613 2.676 0.008 145.322 949.754\n", + "x4 352.4704 200.777 1.756 0.080 -42.205 747.146\n", + "x5 -634.0265 1273.063 -0.498 0.619 -3136.536 1868.483\n", + "x6 285.1002 1033.408 0.276 0.783 -1746.310 2316.510\n", + "x7 -9.4062 658.863 -0.014 0.989 -1304.558 1285.746\n", + "x8 197.4998 502.388 0.393 0.694 -790.064 1185.063\n", + "x9 670.7500 526.463 1.274 0.203 -364.139 1705.639\n", + "x10 11.6643 205.008 0.057 0.955 -391.327 414.656\n", + "==============================================================================\n", + "Omnibus: 0.574 Durbin-Watson: 0.228\n", + "Prob(Omnibus): 0.751 Jarque-Bera (JB): 0.677\n", + "Skew: -0.001 Prob(JB): 0.713\n", + "Kurtosis: 2.804 Cond. No. 
21.4\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "import statsmodels.api as sm\n", + "\n", + "mod = sm.OLS(diabetes_target_train, diabetes_data_train)\n", + "\n", + "res = mod.fit()\n", + "\n", + "print(res.summary())" ] }, { @@ -438,11 +535,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "import pandas as pd\n", + "\n", + "\n", + "auto = pd.read_csv(\"../auto-mpg.csv\")" ] }, { @@ -454,11 +555,124 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_yearcar_name
018.08307.0130.0350412.070\\t\"chevrolet chevelle malibu\"
115.08350.0165.0369311.570\\t\"buick skylark 320\"
218.08318.0150.0343611.070\\t\"plymouth satellite\"
316.08304.0150.0343312.070\\t\"amc rebel sst\"
417.08302.0140.0344910.570\\t\"ford torino\"
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight acceleration \\\n", + "0 18.0 8 307.0 130.0 3504 12.0 \n", + "1 15.0 8 350.0 165.0 3693 11.5 \n", + "2 18.0 8 318.0 150.0 3436 11.0 \n", + "3 16.0 8 304.0 150.0 3433 12.0 \n", + "4 17.0 8 302.0 140.0 3449 10.5 \n", + "\n", + " model_year car_name \n", + "0 70 \\t\"chevrolet chevelle malibu\" \n", + "1 70 \\t\"buick skylark 320\" \n", + "2 70 \\t\"plymouth satellite\" \n", + "3 70 \\t\"amc rebel sst\" \n", + "4 70 \\t\"ford torino\" " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.head()" ] }, { @@ -470,11 +684,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 398 entries, 0 to 397\n", + "Data columns (total 8 columns):\n", + "mpg 398 non-null float64\n", + "cylinders 398 non-null int64\n", + "displacement 398 non-null float64\n", + "horse_power 392 non-null float64\n", + "weight 398 non-null int64\n", + "acceleration 398 non-null float64\n", + "model_year 398 non-null int64\n", + "car_name 398 non-null object\n", + "dtypes: float64(4), int64(3), object(1)\n", + "memory usage: 25.0+ KB\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.info()" ] }, { @@ -486,11 +721,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The newest model year is 82 and the oldest model year is 70\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "newest_model_year = auto[\"model_year\"].max()\n", + "\n", + "oldest_model_year = auto[\"model_year\"].min()\n", + "\n", + "print(f'The newest model year is {newest_model_year} and the 
oldest model year is {oldest_model_year}')" ] }, { @@ -502,11 +750,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horse_power 6\n", + "weight 0\n", + "acceleration 0\n", + "model_year 0\n", + "car_name 0\n", + "dtype: int64" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "auto.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "auto.dropna(inplace=True)" ] }, { @@ -518,11 +796,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "4 199\n", + "8 103\n", + "6 83\n", + "3 4\n", + "5 3\n", + "Name: cylinders, dtype: int64" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto[\"cylinders\"].value_counts()\n", + "\n", + "#There are 5 possible values of cylinders." 
] }, { From 20568d4abdc545a4c761a41321acadff0f460184 Mon Sep 17 00:00:00 2001 From: vkamg Date: Mon, 13 Apr 2020 20:21:50 +0200 Subject: [PATCH 3/3] lab finished --- .../your-code/main.ipynb | 273 ++++++++++++++---- 1 file changed, 217 insertions(+), 56 deletions(-) diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 8cadb183..df7a5d51 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -66,7 +66,7 @@ "dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])" ] }, - "execution_count": 8, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -89,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -184,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -238,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -255,7 +255,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -275,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -299,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -343,7 +343,7 @@ }, { 
"cell_type": "code", - "execution_count": 32, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -374,7 +374,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -438,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -451,7 +451,7 @@ "Model: OLS Adj. R-squared (uncentered): 0.089\n", "Method: Least Squares F-statistic: 5.109\n", "Date: Mon, 13 Apr 2020 Prob (F-statistic): 4.77e-07\n", - "Time: 00:57:01 Log-Likelihood: -2745.5\n", + "Time: 19:25:47 Log-Likelihood: -2745.5\n", "No. Observations: 422 AIC: 5511.\n", "Df Residuals: 412 BIC: 5552.\n", "Df Model: 10 \n", @@ -510,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -535,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -555,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -665,7 +665,7 @@ "4 70 \\t\"ford torino\" " ] }, - "execution_count": 37, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -684,7 +684,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -721,7 +721,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -750,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -767,7 +767,7 @@ "dtype: int64" ] }, - "execution_count": 40, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -780,7 +780,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -796,7 +796,7 @@ }, { "cell_type": "code", - "execution_count": 
43, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -810,7 +810,7 @@ "Name: cylinders, dtype: int64" ] }, - "execution_count": 43, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -822,6 +822,28 @@ "#There are 5 possible values of cylinders." ] }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['mpg', 'cylinders', 'displacement', 'horse_power', 'weight',\n", + " 'acceleration', 'model_year', 'car_name'],\n", + " dtype='object')" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto.columns" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -835,11 +857,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "auto.drop(['car_name'], axis=1, inplace=True)\n", + "\n", + "\n", + "X = auto[['cylinders', 'displacement', 'horse_power', 'weight',\n", + " 'acceleration', 'model_year']].values\n", + "y = auto['mpg'].values\n", + "\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)" ] }, { @@ -853,11 +888,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "auto_model = LinearRegression()\n", + "\n", + "X = X_train\n", + "y = y_train\n", + "\n", + "auto_model = LinearRegression().fit(X, y)" ] }, { @@ -886,11 +928,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8124650406575946" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": 
"execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "from sklearn.metrics import r2_score\n", + "\n", + "y_pred = auto_model.predict(X_train)\n", + "\n", + "r2_score(y_train, y_pred)" ] }, { @@ -906,11 +965,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.7956309524849701" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "y_test_pred = auto_model.predict(X_test)\n", + "\n", + "r2_score(y_test, y_test_pred)" ] }, { @@ -935,11 +1009,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "X = auto[['cylinders', 'displacement', 'horse_power', 'weight',\n", + " 'acceleration', 'model_year']].values\n", + "y = auto['mpg'].values\n", + "\n", + "\n", + "X_train09, X_test09, y_train09, y_test09 = train_test_split(X, y, test_size=0.10)\n", + "\n" ] }, { @@ -951,11 +1033,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "auto_model09 = LinearRegression()\n", + "\n", + "X = X_train09\n", + "y = y_train09\n", + "\n", + "auto_model09 = LinearRegression().fit(X, y)" ] }, { @@ -967,11 +1056,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8003012594878844" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "y_pred09 = auto_model.predict(X_train09)\n", + "\n", + "r2_score(y_train09, y_pred09)" ] }, { @@ 
-983,11 +1087,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8785908709630285" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "y_test_pred09 = auto_model.predict(X_test09)\n", + "\n", + "r2_score(y_test09, y_test_pred09)" ] }, { @@ -1003,7 +1122,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -1019,11 +1138,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "selector = RFE(auto_model, 3, step=1)" ] }, { @@ -1035,11 +1156,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 2, 4, 3, 1, 1])" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "X = auto[['cylinders', 'displacement', 'horse_power', 'weight',\n", + " 'acceleration', 'model_year']].values\n", + "y = auto['mpg'].values\n", + "\n", + "selector = selector.fit(X, y)\n", + "\n", + "selector.ranking_" ] }, { @@ -1053,11 +1193,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(X, y, test_size=0.20)" ] }, { @@ -1069,11 +1211,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.7478134390730498" 
+ ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here: \n" + "# Your code here: \n", + "\n", + "auto_model_reduced = selector.fit(X_train_reduced, y_train_reduced)\n", + "\n", + "y_pred_reduced = auto_model_reduced.predict(X_test_reduced)\n", + "\n", + "r2_score(y_test_reduced, y_pred_reduced)\n", + "\n", + "#It didn't cause any improvement in the r squared score. I don't know if maybe I did something wrong." ] }, {