From ab5fca01cf605769b85148fef3f540f0c17de9ed Mon Sep 17 00:00:00 2001 From: "juliafroch@gmail.com" Date: Fri, 10 Apr 2020 21:14:11 +0200 Subject: [PATCH] done --- .../your-code/main.ipynb | 688 +++++++++++++++--- 1 file changed, 603 insertions(+), 85 deletions(-) diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 0102ef94..9a022c26 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -12,11 +12,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "import pandas as pd\n", + "import numpy as np\n", + "import sklearn\n", + "from sklearn import datasets" ] }, { @@ -37,11 +41,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes = sklearn.datasets.load_diabetes(return_X_y=False)" ] }, { @@ -53,11 +58,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.keys()" ] }, { @@ -73,13 +90,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "scrolled": false }, - "outputs": [], - "source": [ - "# Your code here:\n" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_diabetes_dataset:\n", + "\n", + "Diabetes dataset\n", + "----------------\n", + "\n", + "Ten baseline variables, age, sex, body mass index, average blood\n", + "pressure, and six blood serum measurements were obtained for each of n =\n", + "442 diabetes patients, as well as the response of interest, a\n", + "quantitative measure of disease progression one year after baseline.\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 442\n", + "\n", + " :Number of Attributes: First 10 columns are numeric predictive values\n", + "\n", + " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n", + "\n", + " :Attribute Information:\n", + " - Age\n", + " - Sex\n", + " - Body mass index\n", + " - Average blood pressure\n", + " - S1\n", + " - S2\n", + " - S3\n", + " - S4\n", + " - S5\n", + " - S6\n", + "\n", + "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\n", + "\n", + "Source URL:\n", + "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n", + "\n", + "For more information see:\n", + "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n", + "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes['DESCR'])" ] }, { @@ -101,7 +164,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Enter your answer here:\n" + "# Enter your answer here:\n", + "#1.There are 10 attributes that are demographic characteristics of each individual and also 6 blood test from each of them\n", + "#2. Diabetes['data'] are the attributes, the variables that determine the target variable which in this case is diabetes['target'] that is the progression of each patient\n", + "#3. 
There are 442 patients, each with 10 attributes and one target variable, which makes 442 records and 442x11=4862 values in total\n" ] }, { @@ -115,11 +181,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(442, 10)\n", + "(442,)\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(diabetes['data'].shape)\n", + "print(diabetes['target'].shape)" ] }, { @@ -156,11 +233,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.linear_model import LinearRegression" ] }, { @@ -172,11 +250,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model = linear_model.LinearRegression()" ] }, { @@ -190,11 +269,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_data, diabetes_target = datasets.load_diabetes(return_X_y=True)\n", + "diabetes_data_train = diabetes_X[:-20]\n", + "diabetes_data_test = diabetes_X[-20:]\n", + "diabetes_target_train = diabetes_y[:-20]\n", + "diabetes_target_test = diabetes_y[-20:]" ] }, { @@ -206,11 +290,26 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "152.76430691633442\n", + "[ 3.03499549e-01 -2.37639315e+02 5.10530605e+02 3.27736980e+02\n", + " -8.14131709e+02 4.92814588e+02 1.02848452e+02 1.84606489e+02\n", + " 7.43519617e+02 7.60951722e+01]\n" + ] + } + ], + 
"source": [ + "# Your code here:\n", + "diabetes_model.fit(diabetes_data_train, diabetes_target_train)\n", + "\n", + "print(diabetes_model.intercept_)\n", + "print(diabetes_model.coef_)" ] }, { @@ -231,11 +330,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[197.61846908 155.43979328 172.88665147 111.53537279 164.80054784\n", + " 131.06954875 259.12237761 100.47935157 117.0601052 124.30503555\n", + " 218.36632793 61.19831284 132.25046751 120.3332925 52.54458691\n", + " 194.03798088 102.57139702 123.56604987 211.0346317 52.60335674]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_y_pred = diabetes_model.predict(diabetes_data_test)\n", + "print(diabetes_y_pred)" ] }, { @@ -247,11 +359,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[233. 91. 111. 152. 120. 67. 310. 94. 183. 66. 173. 72. 49. 64.\n", + " 48. 178. 104. 132. 220. 
57.]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(diabetes_target_test)" ] }, { @@ -267,7 +389,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your explanation here:\n" + "# Your explanation here:\n", + "#No, they are not the same: the test targets are the real observed values, while the predictions are only approximations based on the training data\n" ] }, { @@ -351,11 +474,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto = pd.read_csv('../auto-mpg.csv')" ] }, { @@ -367,11 +491,124 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_yearcar_name
018.08307.0130.0350412.070\\t\"chevrolet chevelle malibu\"
115.08350.0165.0369311.570\\t\"buick skylark 320\"
218.08318.0150.0343611.070\\t\"plymouth satellite\"
316.08304.0150.0343312.070\\t\"amc rebel sst\"
417.08302.0140.0344910.570\\t\"ford torino\"
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight acceleration \\\n", + "0 18.0 8 307.0 130.0 3504 12.0 \n", + "1 15.0 8 350.0 165.0 3693 11.5 \n", + "2 18.0 8 318.0 150.0 3436 11.0 \n", + "3 16.0 8 304.0 150.0 3433 12.0 \n", + "4 17.0 8 302.0 140.0 3449 10.5 \n", + "\n", + " model_year car_name \n", + "0 70 \\t\"chevrolet chevelle malibu\" \n", + "1 70 \\t\"buick skylark 320\" \n", + "2 70 \\t\"plymouth satellite\" \n", + "3 70 \\t\"amc rebel sst\" \n", + "4 70 \\t\"ford torino\" " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.head(5)" ] }, { @@ -383,11 +620,32 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 398 entries, 0 to 397\n", + "Data columns (total 8 columns):\n", + "mpg 398 non-null float64\n", + "cylinders 398 non-null int64\n", + "displacement 398 non-null float64\n", + "horse_power 392 non-null float64\n", + "weight 398 non-null int64\n", + "acceleration 398 non-null float64\n", + "model_year 398 non-null int64\n", + "car_name 398 non-null object\n", + "dtypes: float64(4), int64(3), object(1)\n", + "memory usage: 25.0+ KB\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "auto.info()" ] }, { @@ -399,11 +657,155 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_year
count398.000000398.000000398.000000392.000000398.000000398.000000398.000000
mean23.5145735.454774193.425879104.4693882970.42462315.56809076.010050
std7.8159841.701004104.26983838.491160846.8417742.7576893.697627
min9.0000003.00000068.00000046.0000001613.0000008.00000070.000000
25%17.5000004.000000104.25000075.0000002223.75000013.82500073.000000
50%23.0000004.000000148.50000093.5000002803.50000015.50000076.000000
75%29.0000008.000000262.000000126.0000003608.00000017.17500079.000000
max46.6000008.000000455.000000230.0000005140.00000024.80000082.000000
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight \\\n", + "count 398.000000 398.000000 398.000000 392.000000 398.000000 \n", + "mean 23.514573 5.454774 193.425879 104.469388 2970.424623 \n", + "std 7.815984 1.701004 104.269838 38.491160 846.841774 \n", + "min 9.000000 3.000000 68.000000 46.000000 1613.000000 \n", + "25% 17.500000 4.000000 104.250000 75.000000 2223.750000 \n", + "50% 23.000000 4.000000 148.500000 93.500000 2803.500000 \n", + "75% 29.000000 8.000000 262.000000 126.000000 3608.000000 \n", + "max 46.600000 8.000000 455.000000 230.000000 5140.000000 \n", + "\n", + " acceleration model_year \n", + "count 398.000000 398.000000 \n", + "mean 15.568090 76.010050 \n", + "std 2.757689 3.697627 \n", + "min 8.000000 70.000000 \n", + "25% 13.825000 73.000000 \n", + "50% 15.500000 76.000000 \n", + "75% 17.175000 79.000000 \n", + "max 24.800000 82.000000 " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.describe()\n", + "#the newest model year is 82 and the oldest 70" ] }, { @@ -415,11 +817,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.isnull().sum()\n", + "auto.dropna(inplace=True)" ] }, { @@ -431,11 +835,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([8, 4, 6, 3, 5])" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto['cylinders'].unique()" ] }, { @@ -455,7 +871,22 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "auto.drop('car_name', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": 
[], + "source": [ + "# Your code here:\n", + "target = auto['mpg']\n", + "attributes = ['cylinders', 'displacement', 'horse_power', 'weight', 'acceleration']\n", + "data = auto[attributes]\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 0)" ] }, { @@ -469,11 +900,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model = linear_model.LinearRegression()\n", + "auto_model.fit(X_train, y_train)" ] }, { @@ -502,11 +946,26 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7097139425798664" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_pred = auto_model.predict(X_train)\n", + "\n", + "from sklearn.metrics import r2_score\n", + "r2_score(y_train, y_pred)" ] }, { @@ -522,11 +981,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6942573567797339" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_test_pred = auto_model.predict(X_test)\n", + "\n", + "r2_score(y_test, y_test_pred)" ] }, { @@ -551,11 +1024,13 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 69, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.model_selection import train_test_split\n", + "X_train09, X_test09, y_train09, y_test09 = train_test_split(data, target, test_size = 0.1, random_state = 0)" ] }, { @@ -567,11 +1042,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 71, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model09 = linear_model.LinearRegression()\n", + "auto_model09.fit(X_train09, y_train09)" ] }, { @@ -583,11 +1071,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.711486921026971" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_pred09 = auto_model.predict(X_train09)\n", + "\n", + "r2_score(y_train09, y_pred09)" ] }, { @@ -599,11 +1101,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6607860679011375" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_test_pred09 = auto_model.predict(X_test09)\n", + "\n", + "r2_score(y_test09, y_test_pred09)\n", + "\n", + "#there's no improvement in the r squared" ] }, { @@ -726,7 +1244,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + 
"version": "3.7.5" } }, "nbformat": 4,