diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb
index 0102ef94..9a022c26 100644
--- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb
+++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb
@@ -12,11 +12,15 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
- "# Import your libraries:\n"
+ "# Import your libraries:\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import sklearn\n",
+ "from sklearn import datasets"
]
},
{
@@ -37,11 +41,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here:\n"
+ "# Load the diabetes dataset as a dict-like object (not an (X, y) tuple):\n",
+ "diabetes = datasets.load_diabetes()"
]
},
{
@@ -53,11 +58,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here:\n"
+ "# List the fields available in the loaded dataset object:\n",
+ "diabetes.keys()"
]
},
{
@@ -73,13 +90,59 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {
"scrolled": false
},
- "outputs": [],
- "source": [
- "# Your code here:\n"
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ ".. _diabetes_dataset:\n",
+ "\n",
+ "Diabetes dataset\n",
+ "----------------\n",
+ "\n",
+ "Ten baseline variables, age, sex, body mass index, average blood\n",
+ "pressure, and six blood serum measurements were obtained for each of n =\n",
+ "442 diabetes patients, as well as the response of interest, a\n",
+ "quantitative measure of disease progression one year after baseline.\n",
+ "\n",
+ "**Data Set Characteristics:**\n",
+ "\n",
+ " :Number of Instances: 442\n",
+ "\n",
+ " :Number of Attributes: First 10 columns are numeric predictive values\n",
+ "\n",
+ " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n",
+ "\n",
+ " :Attribute Information:\n",
+ " - Age\n",
+ " - Sex\n",
+ " - Body mass index\n",
+ " - Average blood pressure\n",
+ " - S1\n",
+ " - S2\n",
+ " - S3\n",
+ " - S4\n",
+ " - S5\n",
+ " - S6\n",
+ "\n",
+ "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\n",
+ "\n",
+ "Source URL:\n",
+ "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n",
+ "\n",
+ "For more information see:\n",
+ "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n",
+ "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the description text that ships with the dataset:\n",
+ "print(diabetes.DESCR)"
]
},
{
@@ -101,7 +164,10 @@
"metadata": {},
"outputs": [],
"source": [
- "# Enter your answer here:\n"
+ "# Enter your answer here:\n",
+ "#1. There are 10 attributes per patient: age, sex, body mass index, average blood pressure and six blood serum measurements (S1-S6)\n",
+ "#2. diabetes['data'] holds the attributes, the variables used to predict the target variable, which in this case is diabetes['target'], the disease progression of each patient one year after baseline\n",
+ "#3. There are 442 records (one per patient); each has 10 attributes plus the target variable, which makes 442x11=4862 values in total\n"
]
},
{
@@ -115,11 +181,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(442, 10)\n",
+ "(442,)\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here:\n"
+ "# Show the dimensions of the feature matrix and of the target vector:\n",
+ "for field in ('data', 'target'):\n",
+ "    print(diabetes[field].shape)"
]
},
{
@@ -156,11 +233,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here:\n"
+ "# Your code here:\n",
+ "from sklearn.linear_model import LinearRegression"
]
},
{
@@ -172,11 +250,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here:\n"
+ "# Instantiate via the directly imported LinearRegression class; no visible cell binds the bare name `linear_model`:\n",
+ "diabetes_model = LinearRegression()"
]
},
{
@@ -190,11 +269,16 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here:\n"
+ "# Hold out the last 20 samples for testing, slicing the arrays actually returned by load_diabetes:\n",
+ "diabetes_data, diabetes_target = datasets.load_diabetes(return_X_y=True)\n",
+ "diabetes_data_train = diabetes_data[:-20]\n",
+ "diabetes_data_test = diabetes_data[-20:]\n",
+ "diabetes_target_train = diabetes_target[:-20]\n",
+ "diabetes_target_test = diabetes_target[-20:]"
]
},
{
@@ -206,11 +290,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Your code here:\n"
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "152.76430691633442\n",
+ "[ 3.03499549e-01 -2.37639315e+02 5.10530605e+02 3.27736980e+02\n",
+ " -8.14131709e+02 4.92814588e+02 1.02848452e+02 1.84606489e+02\n",
+ " 7.43519617e+02 7.60951722e+01]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Fit on the training split, then report the intercept followed by the coefficients:\n",
+ "diabetes_model.fit(diabetes_data_train, diabetes_target_train)\n",
+ "\n",
+ "for fitted_param in (diabetes_model.intercept_, diabetes_model.coef_):\n",
+ "    print(fitted_param)"
]
},
{
@@ -231,11 +330,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 34,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[197.61846908 155.43979328 172.88665147 111.53537279 164.80054784\n",
+ " 131.06954875 259.12237761 100.47935157 117.0601052 124.30503555\n",
+ " 218.36632793 61.19831284 132.25046751 120.3332925 52.54458691\n",
+ " 194.03798088 102.57139702 123.56604987 211.0346317 52.60335674]\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here:\n"
+ "# Predict the target for the held-out test rows and show the predictions:\n",
+ "diabetes_y_pred = diabetes_model.predict(diabetes_data_test)\n",
+ "print(diabetes_y_pred)"
]
},
{
@@ -247,11 +359,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 33,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[233. 91. 111. 152. 120. 67. 310. 94. 183. 66. 173. 72. 49. 64.\n",
+ " 48. 178. 104. 132. 220. 57.]\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here:\n"
+ "# Print the true target values of the test split for comparison with the predictions:\n",
+ "print(diabetes_target_test)"
]
},
{
@@ -267,7 +389,8 @@
"metadata": {},
"outputs": [],
"source": [
- "# Your explanation here:\n"
+ "# Your explanation here:\n",
+ "#No, they are not the same: the test targets are the real observed values, while the predictions are only approximations from a model fitted on the training data\n"
]
},
{
@@ -351,11 +474,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here:\n"
+ "# Read the auto-mpg CSV (relative path, one directory up) into a DataFrame:\n",
+ "auto = pd.read_csv('../auto-mpg.csv')"
]
},
{
@@ -367,11 +491,124 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Your code here:\n"
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " mpg | \n",
+ " cylinders | \n",
+ " displacement | \n",
+ " horse_power | \n",
+ " weight | \n",
+ " acceleration | \n",
+ " model_year | \n",
+ " car_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 18.0 | \n",
+ " 8 | \n",
+ " 307.0 | \n",
+ " 130.0 | \n",
+ " 3504 | \n",
+ " 12.0 | \n",
+ " 70 | \n",
+ " \\t\"chevrolet chevelle malibu\" | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 15.0 | \n",
+ " 8 | \n",
+ " 350.0 | \n",
+ " 165.0 | \n",
+ " 3693 | \n",
+ " 11.5 | \n",
+ " 70 | \n",
+ " \\t\"buick skylark 320\" | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 18.0 | \n",
+ " 8 | \n",
+ " 318.0 | \n",
+ " 150.0 | \n",
+ " 3436 | \n",
+ " 11.0 | \n",
+ " 70 | \n",
+ " \\t\"plymouth satellite\" | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 16.0 | \n",
+ " 8 | \n",
+ " 304.0 | \n",
+ " 150.0 | \n",
+ " 3433 | \n",
+ " 12.0 | \n",
+ " 70 | \n",
+ " \\t\"amc rebel sst\" | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 17.0 | \n",
+ " 8 | \n",
+ " 302.0 | \n",
+ " 140.0 | \n",
+ " 3449 | \n",
+ " 10.5 | \n",
+ " 70 | \n",
+ " \\t\"ford torino\" | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " mpg cylinders displacement horse_power weight acceleration \\\n",
+ "0 18.0 8 307.0 130.0 3504 12.0 \n",
+ "1 15.0 8 350.0 165.0 3693 11.5 \n",
+ "2 18.0 8 318.0 150.0 3436 11.0 \n",
+ "3 16.0 8 304.0 150.0 3433 12.0 \n",
+ "4 17.0 8 302.0 140.0 3449 10.5 \n",
+ "\n",
+ " model_year car_name \n",
+ "0 70 \\t\"chevrolet chevelle malibu\" \n",
+ "1 70 \\t\"buick skylark 320\" \n",
+ "2 70 \\t\"plymouth satellite\" \n",
+ "3 70 \\t\"amc rebel sst\" \n",
+ "4 70 \\t\"ford torino\" "
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the first five rows (head() returns 5 rows by default):\n",
+ "auto.head()"
]
},
{
@@ -383,11 +620,32 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Your code here:\n"
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 398 entries, 0 to 397\n",
+ "Data columns (total 8 columns):\n",
+ "mpg 398 non-null float64\n",
+ "cylinders 398 non-null int64\n",
+ "displacement 398 non-null float64\n",
+ "horse_power 392 non-null float64\n",
+ "weight 398 non-null int64\n",
+ "acceleration 398 non-null float64\n",
+ "model_year 398 non-null int64\n",
+ "car_name 398 non-null object\n",
+ "dtypes: float64(4), int64(3), object(1)\n",
+ "memory usage: 25.0+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Summarise each column's dtype and non-null count:\n",
+ "auto.info()"
]
},
{
@@ -399,11 +657,155 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Your code here:\n"
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " mpg | \n",
+ " cylinders | \n",
+ " displacement | \n",
+ " horse_power | \n",
+ " weight | \n",
+ " acceleration | \n",
+ " model_year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 398.000000 | \n",
+ " 398.000000 | \n",
+ " 398.000000 | \n",
+ " 392.000000 | \n",
+ " 398.000000 | \n",
+ " 398.000000 | \n",
+ " 398.000000 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 23.514573 | \n",
+ " 5.454774 | \n",
+ " 193.425879 | \n",
+ " 104.469388 | \n",
+ " 2970.424623 | \n",
+ " 15.568090 | \n",
+ " 76.010050 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 7.815984 | \n",
+ " 1.701004 | \n",
+ " 104.269838 | \n",
+ " 38.491160 | \n",
+ " 846.841774 | \n",
+ " 2.757689 | \n",
+ " 3.697627 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 9.000000 | \n",
+ " 3.000000 | \n",
+ " 68.000000 | \n",
+ " 46.000000 | \n",
+ " 1613.000000 | \n",
+ " 8.000000 | \n",
+ " 70.000000 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 17.500000 | \n",
+ " 4.000000 | \n",
+ " 104.250000 | \n",
+ " 75.000000 | \n",
+ " 2223.750000 | \n",
+ " 13.825000 | \n",
+ " 73.000000 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 23.000000 | \n",
+ " 4.000000 | \n",
+ " 148.500000 | \n",
+ " 93.500000 | \n",
+ " 2803.500000 | \n",
+ " 15.500000 | \n",
+ " 76.000000 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 29.000000 | \n",
+ " 8.000000 | \n",
+ " 262.000000 | \n",
+ " 126.000000 | \n",
+ " 3608.000000 | \n",
+ " 17.175000 | \n",
+ " 79.000000 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 46.600000 | \n",
+ " 8.000000 | \n",
+ " 455.000000 | \n",
+ " 230.000000 | \n",
+ " 5140.000000 | \n",
+ " 24.800000 | \n",
+ " 82.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " mpg cylinders displacement horse_power weight \\\n",
+ "count 398.000000 398.000000 398.000000 392.000000 398.000000 \n",
+ "mean 23.514573 5.454774 193.425879 104.469388 2970.424623 \n",
+ "std 7.815984 1.701004 104.269838 38.491160 846.841774 \n",
+ "min 9.000000 3.000000 68.000000 46.000000 1613.000000 \n",
+ "25% 17.500000 4.000000 104.250000 75.000000 2223.750000 \n",
+ "50% 23.000000 4.000000 148.500000 93.500000 2803.500000 \n",
+ "75% 29.000000 8.000000 262.000000 126.000000 3608.000000 \n",
+ "max 46.600000 8.000000 455.000000 230.000000 5140.000000 \n",
+ "\n",
+ " acceleration model_year \n",
+ "count 398.000000 398.000000 \n",
+ "mean 15.568090 76.010050 \n",
+ "std 2.757689 3.697627 \n",
+ "min 8.000000 70.000000 \n",
+ "25% 13.825000 73.000000 \n",
+ "50% 15.500000 76.000000 \n",
+ "75% 17.175000 79.000000 \n",
+ "max 24.800000 82.000000 "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Summary statistics for the numeric columns:\n",
+ "auto.describe()\n",
+ "# The newest model year is 82 and the oldest is 70 (max and min of model_year)."
]
},
{
@@ -415,11 +817,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here:\n"
+ "# Print the per-column missing-value counts (a bare expression that is not the cell's last line is never displayed), then drop incomplete rows:\n",
+ "print(auto.isnull().sum())\n",
+ "auto.dropna(inplace=True)"
]
},
{
@@ -431,11 +835,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 50,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([8, 4, 6, 3, 5])"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here:\n"
+ "# Your code here:\n",
+ "auto['cylinders'].unique()"
]
},
{
@@ -455,7 +871,22 @@
"metadata": {},
"outputs": [],
"source": [
- "# Your code here:\n"
+ "auto = auto.drop(columns='car_name', errors='ignore')  # errors='ignore' keeps this cell re-runnable once the column is gone"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Separate the predictors from the mpg response, then make an 80/20 train/test split:\n",
+ "attributes = ['cylinders', 'displacement', 'horse_power', 'weight', 'acceleration']\n",
+ "data = auto[attributes]\n",
+ "target = auto['mpg']\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=0)"
]
},
{
@@ -469,11 +900,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 61,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here:\n"
+ "# Build the regressor with the directly imported LinearRegression class (no visible cell binds `linear_model`) and fit it on the training split:\n",
+ "auto_model = LinearRegression()\n",
+ "auto_model.fit(X_train, y_train)"
]
},
{
@@ -502,11 +946,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Your code here:\n"
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.7097139425798664"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# R squared of the fitted model on the training split:\n",
+ "from sklearn.metrics import r2_score\n",
+ "\n",
+ "y_pred = auto_model.predict(X_train)\n",
+ "r2_score(y_train, y_pred)"
]
},
{
@@ -522,11 +981,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Your code here:\n"
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6942573567797339"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# R squared of the same model on the held-out test split:\n",
+ "y_test_pred = auto_model.predict(X_test)\n",
+ "\n",
+ "r2_score(y_true=y_test, y_pred=y_test_pred)"
]
},
{
@@ -551,11 +1024,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here:\n"
+ "# Repeat the split, this time holding out 10% of the rows for testing (same random_state as before):\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "X_train09, X_test09, y_train09, y_test09 = train_test_split(data, target, test_size = 0.1, random_state = 0)"
]
},
{
@@ -567,11 +1042,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 71,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here:\n"
+ "# Build a second regressor with the directly imported LinearRegression class (no visible cell binds `linear_model`) and fit it on the 90% split:\n",
+ "auto_model09 = LinearRegression()\n",
+ "auto_model09.fit(X_train09, y_train09)"
]
},
{
@@ -583,11 +1071,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Your code here:\n"
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.711486921026971"
+ ]
+ },
+ "execution_count": 72,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Score the 90/10 model (auto_model09) on its own training split; auto_model was fitted on the 80/20 split:\n",
+ "y_pred09 = auto_model09.predict(X_train09)\n",
+ "\n",
+ "r2_score(y_train09, y_pred09)"
]
},
{
@@ -599,11 +1101,27 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Your code here:\n"
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6607860679011375"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Score the 90/10 model (auto_model09), not the earlier auto_model, on its held-out test split:\n",
+ "y_test_pred09 = auto_model09.predict(X_test09)\n",
+ "\n",
+ "r2_score(y_test09, y_test_pred09)\n",
+ "\n",
+ "# NOTE(review): the earlier remark that there's no improvement in the r squared was based on auto_model's predictions; re-run and re-check"
]
},
{
@@ -726,7 +1244,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.6"
+ "version": "3.7.5"
}
},
"nbformat": 4,