diff --git a/module-3/lab-supervised-learning-sklearn/push b/module-3/lab-supervised-learning-sklearn/push new file mode 100644 index 00000000..aa981746 --- /dev/null +++ b/module-3/lab-supervised-learning-sklearn/push @@ -0,0 +1 @@ +fsfa diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 0102ef94..4d91cefd 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -12,11 +12,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "import pandas as pd\n", + "from sklearn import datasets" ] }, { @@ -37,11 +39,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes = datasets.load_diabetes(return_X_y=False)" ] }, { @@ -53,11 +56,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.keys()" ] }, { @@ -73,13 +88,57 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_diabetes_dataset:\n", + "\n", + "Diabetes dataset\n", + "----------------\n", + "\n", + "Ten baseline variables, age, sex, body mass index, average blood\n", + "pressure, and six blood serum measurements were obtained for each of n =\n", + "442 diabetes patients, as well as the response of interest, a\n", + "quantitative measure of disease progression one year after baseline.\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 442\n", + "\n", + " :Number of Attributes: First 10 columns are numeric predictive values\n", + "\n", + " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n", + "\n", + " :Attribute Information:\n", + " - Age\n", + " - Sex\n", + " - Body mass index\n", + " - Average blood pressure\n", + " - S1\n", + " - S2\n", + " - S3\n", + " - S4\n", + " - S5\n", + " - S6\n", + "\n", + "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\n", + "\n", + "Source URL:\n", + "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n", + "\n", + "For more information see:\n", + "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n", + "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print (diabetes.DESCR)" ] }, { @@ -97,11 +156,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# Enter your answer here:\n" + "# Enter your answer here:\n", + "# How many attributes are there in the data? 
What do they mean?\n", + " #There are 10 attributes, all of them measured on each individual (age, sex, bmi, average blood pressure and 6 blood serum measurements)\n", + "# What is the relation between diabetes['data'] and diabetes['target']?\n", + " #Diabetes data is the 'description' of each individual, target measures how the disease has progressed one year after baseline.\n", + "# How many records are there in the data?\n", + " #There is data from 442 individuals" ] }, { @@ -115,11 +180,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(442, 10) (442,)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes.data.shape, diabetes.target.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "#Data has 442 records and 10 columns, target has 442 records and only 1 column. Exactly what was expected." 
] }, { @@ -156,11 +239,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.linear_model import LinearRegression" ] }, { @@ -172,11 +256,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model = LinearRegression()" ] }, { @@ -190,11 +275,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "X = diabetes.data\n", + "y = diabetes.target\n", + "from sklearn.model_selection import train_test_split\n", + "diabetes_data_train, diabetes_data_test, diabetes_target_train, diabetes_target_test = train_test_split(\n", + " X, y, test_size=0.042, random_state=42)" ] }, { @@ -206,11 +296,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intercept: 151.526250776629\n", + "Coefficients: [ 2.67659502 -257.98776342 539.75237886 339.16378235 -854.04450849\n", + " 477.72031381 147.45555947 252.27907163 716.32314001 63.57644276]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model.fit(diabetes_data_train, diabetes_target_train)\n", + "print(f'Intercept: {diabetes_model.intercept_}')\n", + "print(f'Coefficients: {diabetes_model.coef_}')\n", + "\n" ] }, { @@ -231,11 +335,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([141.81216009, 176.62912663, 134.52300035, 287.7903017 ,\n", + " 122.88444183, 96.36277017, 257.66709041, 
184.78962182,\n", + " 92.6144851 , 111.50621663, 98.34330547, 165.83614827,\n", + " 58.00056113, 205.43398993, 100.75616227, 130.5507428 ,\n", + " 218.81135251, 246.83179054, 193.45957561])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "diabetes_model.predict(diabetes_data_test)" ] }, { @@ -247,11 +367,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[219. 70. 202. 230. 111. 84. 242. 272. 94. 96. 94. 252. 99. 297.\n", + " 135. 67. 295. 264. 170.]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(diabetes_target_test)" ] }, { @@ -263,11 +393,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ - "# Your explanation here:\n" + "# Your explanation here:\n", + " #The predictions are not even close to the real values; there are a lot of differences. The variables used for the prediction are not \"very predictive\" of the evolution of the diabetes" ] }, { @@ -302,11 +433,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "import numpy as np\n", + "import statsmodels.api as sm\n", + "\n", + "\n", + "#I AM SKIPPING THIS BONUS " ] }, { @@ -351,11 +487,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto = pd.read_csv('../auto-mpg.csv')" ] }, { @@ -367,11 +504,144 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | mpg | \n", + "cylinders | \n", + "displacement | \n", + "horse_power | \n", + "weight | \n", + "acceleration | \n", + "model_year | \n", + "car_name | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "18.0 | \n", + "8 | \n", + "307.0 | \n", + "130.0 | \n", + "3504 | \n", + "12.0 | \n", + "70 | \n", + "\\t\"chevrolet chevelle malibu\" | \n", + "
| 1 | \n", + "15.0 | \n", + "8 | \n", + "350.0 | \n", + "165.0 | \n", + "3693 | \n", + "11.5 | \n", + "70 | \n", + "\\t\"buick skylark 320\" | \n", + "
| 2 | \n", + "18.0 | \n", + "8 | \n", + "318.0 | \n", + "150.0 | \n", + "3436 | \n", + "11.0 | \n", + "70 | \n", + "\\t\"plymouth satellite\" | \n", + "
| 3 | \n", + "16.0 | \n", + "8 | \n", + "304.0 | \n", + "150.0 | \n", + "3433 | \n", + "12.0 | \n", + "70 | \n", + "\\t\"amc rebel sst\" | \n", + "
| 4 | \n", + "17.0 | \n", + "8 | \n", + "302.0 | \n", + "140.0 | \n", + "3449 | \n", + "10.5 | \n", + "70 | \n", + "\\t\"ford torino\" | \n", + "