diff --git a/module-1/.gitignore b/module-1/.gitignore index b2e5ef3d..1fb4aecd 100644 --- a/module-1/.gitignore +++ b/module-1/.gitignore @@ -1,2 +1,3 @@ DS_Store -.ipynb_checkpoints \ No newline at end of file +.ipynb_checkpoints +.bash_profile diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 0102ef94..7387c37e 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -12,11 +12,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "import pandas as pd\n", + "from sklearn.datasets import load_diabetes" ] }, { @@ -37,11 +39,73 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'data': array([[ 0.03807591, 0.05068012, 0.06169621, ..., -0.00259226,\n", + " 0.01990842, -0.01764613],\n", + " [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,\n", + " -0.06832974, -0.09220405],\n", + " [ 0.08529891, 0.05068012, 0.04445121, ..., -0.00259226,\n", + " 0.00286377, -0.02593034],\n", + " ...,\n", + " [ 0.04170844, 0.05068012, -0.01590626, ..., -0.01107952,\n", + " -0.04687948, 0.01549073],\n", + " [-0.04547248, -0.04464164, 0.03906215, ..., 0.02655962,\n", + " 0.04452837, -0.02593034],\n", + " [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,\n", + " -0.00421986, 0.00306441]]), 'target': array([151., 75., 141., 206., 135., 97., 138., 63., 110., 310., 101.,\n", + " 69., 179., 185., 118., 171., 166., 144., 97., 168., 68., 49.,\n", + " 68., 245., 184., 202., 137., 85., 131., 283., 129., 59., 341.,\n", + " 87., 65., 102., 265., 276., 252., 90., 100., 
55., 61., 92.,\n", + " 259., 53., 190., 142., 75., 142., 155., 225., 59., 104., 182.,\n", + " 128., 52., 37., 170., 170., 61., 144., 52., 128., 71., 163.,\n", + " 150., 97., 160., 178., 48., 270., 202., 111., 85., 42., 170.,\n", + " 200., 252., 113., 143., 51., 52., 210., 65., 141., 55., 134.,\n", + " 42., 111., 98., 164., 48., 96., 90., 162., 150., 279., 92.,\n", + " 83., 128., 102., 302., 198., 95., 53., 134., 144., 232., 81.,\n", + " 104., 59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,\n", + " 173., 180., 84., 121., 161., 99., 109., 115., 268., 274., 158.,\n", + " 107., 83., 103., 272., 85., 280., 336., 281., 118., 317., 235.,\n", + " 60., 174., 259., 178., 128., 96., 126., 288., 88., 292., 71.,\n", + " 197., 186., 25., 84., 96., 195., 53., 217., 172., 131., 214.,\n", + " 59., 70., 220., 268., 152., 47., 74., 295., 101., 151., 127.,\n", + " 237., 225., 81., 151., 107., 64., 138., 185., 265., 101., 137.,\n", + " 143., 141., 79., 292., 178., 91., 116., 86., 122., 72., 129.,\n", + " 142., 90., 158., 39., 196., 222., 277., 99., 196., 202., 155.,\n", + " 77., 191., 70., 73., 49., 65., 263., 248., 296., 214., 185.,\n", + " 78., 93., 252., 150., 77., 208., 77., 108., 160., 53., 220.,\n", + " 154., 259., 90., 246., 124., 67., 72., 257., 262., 275., 177.,\n", + " 71., 47., 187., 125., 78., 51., 258., 215., 303., 243., 91.,\n", + " 150., 310., 153., 346., 63., 89., 50., 39., 103., 308., 116.,\n", + " 145., 74., 45., 115., 264., 87., 202., 127., 182., 241., 66.,\n", + " 94., 283., 64., 102., 200., 265., 94., 230., 181., 156., 233.,\n", + " 60., 219., 80., 68., 332., 248., 84., 200., 55., 85., 89.,\n", + " 31., 129., 83., 275., 65., 198., 236., 253., 124., 44., 172.,\n", + " 114., 142., 109., 180., 144., 163., 147., 97., 220., 190., 109.,\n", + " 191., 122., 230., 242., 248., 249., 192., 131., 237., 78., 135.,\n", + " 244., 199., 270., 164., 72., 96., 306., 91., 214., 95., 216.,\n", + " 263., 178., 113., 200., 139., 139., 88., 148., 88., 243., 71.,\n", + " 77., 
109., 272., 60., 54., 221., 90., 311., 281., 182., 321.,\n", + " 58., 262., 206., 233., 242., 123., 167., 63., 197., 71., 168.,\n", + " 140., 217., 121., 235., 245., 40., 52., 104., 132., 88., 69.,\n", + " 219., 72., 201., 110., 51., 277., 63., 118., 69., 273., 258.,\n", + " 43., 198., 242., 232., 175., 93., 168., 275., 293., 281., 72.,\n", + " 140., 189., 181., 209., 136., 261., 113., 131., 174., 257., 55.,\n", + " 84., 42., 146., 212., 233., 91., 111., 152., 120., 67., 310.,\n", + " 94., 183., 66., 173., 72., 49., 64., 48., 178., 104., 132.,\n", + " 220., 57.]), 'DESCR': '.. _diabetes_dataset:\\n\\nDiabetes dataset\\n----------------\\n\\nTen baseline variables, age, sex, body mass index, average blood\\npressure, and six blood serum measurements were obtained for each of n =\\n442 diabetes patients, as well as the response of interest, a\\nquantitative measure of disease progression one year after baseline.\\n\\n**Data Set Characteristics:**\\n\\n :Number of Instances: 442\\n\\n :Number of Attributes: First 10 columns are numeric predictive values\\n\\n :Target: Column 11 is a quantitative measure of disease progression one year after baseline\\n\\n :Attribute Information:\\n - Age\\n - Sex\\n - Body mass index\\n - Average blood pressure\\n - S1\\n - S2\\n - S3\\n - S4\\n - S5\\n - S6\\n\\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. 
the sum of squares of each column totals 1).\\n\\nSource URL:\\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\\n\\nFor more information see:\\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)', 'feature_names': ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], 'data_filename': '/Users/borjauria/miniconda3/envs/ironhack_env/lib/python3.7/site-packages/sklearn/datasets/data/diabetes_data.csv.gz', 'target_filename': '/Users/borjauria/miniconda3/envs/ironhack_env/lib/python3.7/site-packages/sklearn/datasets/data/diabetes_target.csv.gz'}\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "diabetes = load_diabetes()\n", + "print(diabetes)" ] }, { @@ -53,11 +117,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.keys()" ] }, { @@ -73,13 +149,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "scrolled": false }, - "outputs": [], - "source": [ - "# Your code here:\n" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_diabetes_dataset:\n", + "\n", + "Diabetes dataset\n", + "----------------\n", + "\n", + "Ten baseline variables, age, sex, body mass index, average blood\n", + "pressure, and six blood serum measurements were obtained for each of n =\n", + "442 diabetes patients, as well as the response of interest, a\n", + "quantitative measure of disease progression one year after baseline.\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 442\n", + "\n", + " :Number of Attributes: First 10 columns are numeric predictive values\n", + "\n", + " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n", + "\n", + " :Attribute Information:\n", + " - Age\n", + " - Sex\n", + " - Body mass index\n", + " - Average blood pressure\n", + " - S1\n", + " - S2\n", + " - S3\n", + " - S4\n", + " - S5\n", + " - S6\n", + "\n", + "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. 
the sum of squares of each column totals 1).\n", + "\n", + "Source URL:\n", + "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n", + "\n", + "For more information see:\n", + "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n", + "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes.DESCR)" ] }, { @@ -97,11 +219,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(442, 10)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Enter your answer here:\n" + "diabetes.data.shape\n", + "# Enter your answer here:\n", + "# There are 10 attributes in the data: the patient's age, sex, body mass index, average blood pressure, and six blood serum measurements (S1-S6).\n", + "# There are 442 records in the data" ] }, { @@ -115,11 +251,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(442,)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.target.shape" ] }, { @@ -156,11 +304,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.linear_model import LinearRegression\n", + "X = diabetes['data']\n", + "y = diabetes['target']" ] }, { @@ -172,11 +323,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model = 
LinearRegression()" ] }, { @@ -190,11 +342,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.model_selection import train_test_split\n", + "diabetes_data_train, diabetes_data_test, diabetes_target_train, diabetes_target_test = train_test_split(X, y, test_size = 0.20)" ] }, { @@ -206,11 +360,65 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model.fit(diabetes_data_train, diabetes_target_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "151.86239034549334" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diabetes_model.intercept_" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ -27.71391975, -267.08862827, 536.18199219, 332.4318144 ,\n", + " -716.85132584, 446.31139487, 73.94204506, 217.56183393,\n", + " 690.36686518, 72.62325409])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diabetes_model.coef_" ] }, { @@ -231,11 +439,46 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([238.58574559, 91.50331068, 75.09325302, 64.88748908,\n", + " 115.81982316, 187.63818037, 113.99486439, 176.054878 
,\n", + " 78.92834033, 140.32636457, 144.84279661, 100.08484328,\n", + " 227.19186843, 147.10362276, 100.00082362, 207.46308607,\n", + " 190.16726124, 97.99009049, 234.76243416, 73.09364077,\n", + " 111.90262063, 229.82762169, 196.73944615, 86.10457733,\n", + " 191.87375378, 200.51580174, 99.1540136 , 173.07937935,\n", + " 234.08776711, 106.37633822, 232.80297325, 67.23026403,\n", + " 180.77663043, 251.73652467, 161.12191595, 112.60228742,\n", + " 87.88809539, 246.77918189, 118.68359552, 244.72963435,\n", + " 189.43631778, 143.13429118, 72.46345876, 143.20874428,\n", + " 142.05684509, 57.23448674, 135.32582782, 120.35824672,\n", + " 174.16913954, 106.59499321, 173.95730681, 126.49273779,\n", + " 52.77539112, 142.07084877, 229.58743125, 181.77285511,\n", + " 261.01845717, 90.90352337, 124.58937301, 210.70379539,\n", + " 102.78349004, 112.40582843, 197.92242404, 37.81043164,\n", + " 217.69058287, 222.19662037, 161.43074622, 142.52649468,\n", + " 170.98257978, 161.86560192, 123.42940713, 159.39838091,\n", + " 110.90948683, 186.27114682, 220.84531227, 76.27793437,\n", + " 221.01721341, 156.44510601, 161.95906927, 43.07618446,\n", + " 201.2911246 , 229.92636947, 239.1495099 , 205.63282517,\n", + " 116.30675283, 195.51938123, 118.67975747, 112.57204242,\n", + " 182.45551813])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_pred = diabetes_model.predict(diabetes_data_test)\n", + "y_pred" ] }, { @@ -247,11 +490,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[317. 101. 37. 83. 153. 84. 179. 277. 200. 190. 25. 81. 155. 172.\n", + " 118. 52. 78. 170. 261. 134. 61. 248. 292. 96. 178. 293. 135. 70.\n", + " 275. 111. 128. 39. 164. 245. 104. 53. 91. 264. 89. 252. 129. 50.\n", + " 55. 88. 93. 39. 124. 178. 283. 87. 91. 103. 57. 168. 246. 
139.\n", + " 303. 64. 131. 288. 94. 200. 131. 104. 275. 152. 196. 116. 242. 252.\n", + " 150. 154. 160. 170. 295. 72. 192. 151. 144. 116. 233. 236. 270. 233.\n", + " 66. 123. 177. 107. 144.]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(diabetes_target_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3338.2019711743733" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "mean_squared_error(y_true = diabetes_target_test, y_pred = y_pred)" ] }, { @@ -263,11 +542,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "# Your explanation here:\n" + "# Your explanation here:" ] }, { @@ -302,7 +581,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -326,11 +605,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "# Your answers here:" + "# Your answers here:\n" ] }, { @@ -351,11 +630,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto = pd.read_csv('../auto-mpg.csv')" ] }, { @@ -367,11 +647,124 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | mpg | \n", + "cylinders | \n", + "displacement | \n", + "horse_power | \n", + "weight | \n", + "acceleration | \n", + "model_year | \n", + "car_name | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "18.0 | \n", + "8 | \n", + "307.0 | \n", + "130.0 | \n", + "3504 | \n", + "12.0 | \n", + "70 | \n", + "\\t\"chevrolet chevelle malibu\" | \n", + "
| 1 | \n", + "15.0 | \n", + "8 | \n", + "350.0 | \n", + "165.0 | \n", + "3693 | \n", + "11.5 | \n", + "70 | \n", + "\\t\"buick skylark 320\" | \n", + "
| 2 | \n", + "18.0 | \n", + "8 | \n", + "318.0 | \n", + "150.0 | \n", + "3436 | \n", + "11.0 | \n", + "70 | \n", + "\\t\"plymouth satellite\" | \n", + "
| 3 | \n", + "16.0 | \n", + "8 | \n", + "304.0 | \n", + "150.0 | \n", + "3433 | \n", + "12.0 | \n", + "70 | \n", + "\\t\"amc rebel sst\" | \n", + "
| 4 | \n", + "17.0 | \n", + "8 | \n", + "302.0 | \n", + "140.0 | \n", + "3449 | \n", + "10.5 | \n", + "70 | \n", + "\\t\"ford torino\" | \n", + "