diff --git a/module-3/lab-supervised-learning-sklearn/push b/module-3/lab-supervised-learning-sklearn/push new file mode 100644 index 00000000..aa981746 --- /dev/null +++ b/module-3/lab-supervised-learning-sklearn/push @@ -0,0 +1 @@ +fsfa diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 0102ef94..4d91cefd 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -12,11 +12,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "import pandas as pd\n", + "from sklearn import datasets" ] }, { @@ -37,11 +39,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes = datasets.load_diabetes(return_X_y=False)" ] }, { @@ -53,11 +56,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.keys()" ] }, { @@ -73,13 +88,57 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_diabetes_dataset:\n", + "\n", + "Diabetes dataset\n", + "----------------\n", + "\n", + "Ten baseline variables, age, sex, body mass index, average blood\n", + "pressure, and six blood serum measurements were obtained for each of n =\n", + "442 diabetes patients, as well as the response of interest, a\n", + "quantitative measure of disease progression one year after baseline.\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 442\n", + "\n", + " :Number of Attributes: First 10 columns are numeric predictive values\n", + "\n", + " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n", + "\n", + " :Attribute Information:\n", + " - Age\n", + " - Sex\n", + " - Body mass index\n", + " - Average blood pressure\n", + " - S1\n", + " - S2\n", + " - S3\n", + " - S4\n", + " - S5\n", + " - S6\n", + "\n", + "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\n", + "\n", + "Source URL:\n", + "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n", + "\n", + "For more information see:\n", + "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n", + "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print (diabetes.DESCR)" ] }, { @@ -97,11 +156,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# Enter your answer here:\n" + "# Enter your answer here:\n", + "# How many attributes are there in the data? 
What do they mean?\n", + " #There are 10 attributes, all of them describing each individual (age, sex, BMI, average blood pressure and 6 blood serum measurements)\n", + "# What is the relation between diabetes['data'] and diabetes['target']?\n", + " #diabetes['data'] holds the baseline description of each individual; diabetes['target'] measures how the disease has progressed one year after baseline.\n", + "# How many records are there in the data?\n", + " #There is data from 442 individuals" ] }, { @@ -115,11 +180,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(442, 10) (442,)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes.data.shape, diabetes.target.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "#Data has 442 rows and 10 columns; target has 442 rows and a single column. Exactly what was expected." 
] }, { @@ -156,11 +239,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.linear_model import LinearRegression" ] }, { @@ -172,11 +256,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model = LinearRegression()" ] }, { @@ -190,11 +275,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "X = diabetes.data\n", + "y = diabetes.target\n", + "from sklearn.model_selection import train_test_split\n", + "diabetes_data_train, diabetes_data_test, diabetes_target_train, diabetes_target_test = train_test_split(\n", + " X, y, test_size=0.042, random_state=42)" ] }, { @@ -206,11 +296,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intercept: 151.526250776629\n", + "Coefficients: [ 2.67659502 -257.98776342 539.75237886 339.16378235 -854.04450849\n", + " 477.72031381 147.45555947 252.27907163 716.32314001 63.57644276]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model.fit(diabetes_data_train, diabetes_target_train)\n", + "print(f'Intercept: {diabetes_model.intercept_}')\n", + "print(f'Coefficients: {diabetes_model.coef_}')\n", + "\n" ] }, { @@ -231,11 +335,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([141.81216009, 176.62912663, 134.52300035, 287.7903017 ,\n", + " 122.88444183, 96.36277017, 257.66709041, 
184.78962182,\n", + " 92.6144851 , 111.50621663, 98.34330547, 165.83614827,\n", + " 58.00056113, 205.43398993, 100.75616227, 130.5507428 ,\n", + " 218.81135251, 246.83179054, 193.45957561])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "diabetes_model.predict(diabetes_data_test)" ] }, { @@ -247,11 +367,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[219. 70. 202. 230. 111. 84. 242. 272. 94. 96. 94. 252. 99. 297.\n", + " 135. 67. 295. 264. 170.]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(diabetes_target_test)" ] }, { @@ -263,11 +393,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ - "# Your explanation here:\n" + "# Your explanation here:\n", + " #not even close the prediction vs the reality, there are a lot of differences. The variables use for the predition are not \"very predictive\" as evolution of the diabetes" ] }, { @@ -302,11 +433,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "import numpy as np\n", + "import statsmodels.api as sm\n", + "\n", + "\n", + "#I AM SKIPPING THIS BONUS " ] }, { @@ -351,11 +487,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto = pd.read_csv('../auto-mpg.csv')" ] }, { @@ -367,11 +504,144 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_yearcar_name
018.08307.0130.0350412.070\\t\"chevrolet chevelle malibu\"
115.08350.0165.0369311.570\\t\"buick skylark 320\"
218.08318.0150.0343611.070\\t\"plymouth satellite\"
316.08304.0150.0343312.070\\t\"amc rebel sst\"
417.08302.0140.0344910.570\\t\"ford torino\"
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight acceleration \\\n", + "0 18.0 8 307.0 130.0 3504 12.0 \n", + "1 15.0 8 350.0 165.0 3693 11.5 \n", + "2 18.0 8 318.0 150.0 3436 11.0 \n", + "3 16.0 8 304.0 150.0 3433 12.0 \n", + "4 17.0 8 302.0 140.0 3449 10.5 \n", + "\n", + " model_year car_name \n", + "0 70 \\t\"chevrolet chevelle malibu\" \n", + "1 70 \\t\"buick skylark 320\" \n", + "2 70 \\t\"plymouth satellite\" \n", + "3 70 \\t\"amc rebel sst\" \n", + "4 70 \\t\"ford torino\" " + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 113, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(398, 8)" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "auto.shape" ] }, { @@ -383,11 +653,31 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg float64\n", + "cylinders int64\n", + "displacement float64\n", + "horse_power float64\n", + "weight int64\n", + "acceleration float64\n", + "model_year int64\n", + "car_name object\n", + "dtype: object" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.dtypes" ] }, { @@ -399,11 +689,31 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 398.000000\n", + "mean 76.010050\n", + "std 3.697627\n", + "min 70.000000\n", + "25% 73.000000\n", + "50% 76.000000\n", + "75% 79.000000\n", + "max 82.000000\n", + 
"Name: model_year, dtype: float64" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto['model_year'].describe()" ] }, { @@ -415,11 +725,90 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horse_power 6\n", + "weight 0\n", + "acceleration 0\n", + "model_year 0\n", + "car_name 0\n", + "dtype: int64" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "#after checking there is only column \"horse_power\" with missing values. I proceed to delete all rows with missing values\n", + "auto = auto.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horse_power 0\n", + "weight 0\n", + "acceleration 0\n", + "model_year 0\n", + "car_name 0\n", + "dtype: int64" + ] + }, + "execution_count": 118, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(392, 8)" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto.shape\n", + "#There were 398 rows at the beggining, after dropping the na rows, there are only 392" ] }, { @@ -431,11 +820,29 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 120, + "metadata": {}, + "outputs": [ + { 
+ "data": { + "text/plain": [ + "4 199\n", + "8 103\n", + "6 83\n", + "3 4\n", + "5 3\n", + "Name: cylinders, dtype: int64" + ] + }, + "execution_count": 120, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto['cylinders'].value_counts()\n", + "#there are 5 possible values" ] }, { @@ -451,11 +858,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "#Dropping the \"car_name\" column\n", + "auto = auto.loc[:, auto.columns != 'car_name']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [], + "source": [ + "#Creating X , y will use for the model\n", + "y = auto['mpg']\n", + "X = auto.loc[:, auto.columns != 'mpg']" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2)" ] }, { @@ -469,11 +899,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 124, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model = LinearRegression()\n", + "auto_model.fit(X_train, y_train)" ] }, { @@ -502,11 +945,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8121425247115885" + ] + }, + "execution_count": 125, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "from sklearn.metrics import 
r2_score\n", + "\n", + "y_pred = auto_model.predict(X_train)\n", + "\n", + "r2_score(y_train, y_pred)\n" ] }, { @@ -522,11 +981,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 126, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.7958650048990128" + ] + }, + "execution_count": 126, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_test_pred = auto_model.predict(X_test)\n", + "r2_score(y_test, y_test_pred)\n" ] }, { @@ -538,6 +1010,15 @@ "The r squared scores of the training data and the test data are pretty close (0.8146 vs 0.7818). This means our model is not overfitted. However, there is still room to improve the model fit. Move on to the next challenge." ] }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [], + "source": [ + "# I do not get the same results but it's true they are pretty close." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -551,11 +1032,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "X_train09, X_test09, y_train09, y_test09 = train_test_split(\n", + " X, y, test_size=0.1)" ] }, { @@ -567,11 +1050,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model09 = LinearRegression()" ] }, { @@ -583,11 +1067,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 130, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8043919746446471" + ] + }, + "execution_count": 130, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model09.fit(X_train09, y_train09)\n", + 
"y_train_predict09 = auto_model09.predict(X_train09)  # score the model fitted on the 90% split, not the earlier auto_model\n", + "r2_score (y_train09, y_train_predict09)" ] }, { @@ -599,11 +1097,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 131, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8275315374679679" + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_test_predict09 = auto_model09.predict(X_test09)  # evaluate auto_model09 on its own held-out 10%\n", + "r2_score (y_test09 , y_test_predict09)" ] }, { @@ -712,9 +1223,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:data_env]", "language": "python", - "name": "python3" + "name": "conda-env-data_env-py" }, "language_info": { "codemirror_mode": { @@ -726,9 +1237,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.7.5" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }