From 2f270165c2ec59297a778b4766b4f5366fc48010 Mon Sep 17 00:00:00 2001 From: almsasantos Date: Sat, 7 Mar 2020 13:59:58 +0100 Subject: [PATCH 1/4] create pull --- module-3/lab-supervised-learning-sklearn/your-code/main.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 0102ef94..244f2661 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -726,7 +726,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.7.5" } }, "nbformat": 4, From 6d3e719413635ca0d71e00867a788353e1441516 Mon Sep 17 00:00:00 2001 From: almsasantos Date: Sat, 7 Mar 2020 14:02:12 +0100 Subject: [PATCH 2/4] create pull --- module-3/lab-supervised-learning-sklearn/your-code/main.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 244f2661..891e69c6 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -16,7 +16,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "import pandas as pd" ] }, { From adb1852aba6a9ad2e0b73883d9d06b21a78d6495 Mon Sep 17 00:00:00 2001 From: almsasantos Date: Sun, 8 Mar 2020 20:22:54 +0100 Subject: [PATCH 3/4] still gotta do challenge 5 and bonus --- .../your-code/main.ipynb | 735 ++++++++++++++++-- 1 file changed, 658 insertions(+), 77 deletions(-) diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 891e69c6..99cfc233 100644 --- 
a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -12,12 +12,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import your libraries:\n", - "import pandas as pd" + "import pandas as pd\n", + "from sklearn.datasets import load_diabetes" ] }, { @@ -38,11 +39,98 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes = load_diabetes()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data': array([[ 0.03807591, 0.05068012, 0.06169621, ..., -0.00259226,\n", + " 0.01990842, -0.01764613],\n", + " [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,\n", + " -0.06832974, -0.09220405],\n", + " [ 0.08529891, 0.05068012, 0.04445121, ..., -0.00259226,\n", + " 0.00286377, -0.02593034],\n", + " ...,\n", + " [ 0.04170844, 0.05068012, -0.01590626, ..., -0.01107952,\n", + " -0.04687948, 0.01549073],\n", + " [-0.04547248, -0.04464164, 0.03906215, ..., 0.02655962,\n", + " 0.04452837, -0.02593034],\n", + " [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,\n", + " -0.00421986, 0.00306441]]),\n", + " 'target': array([151., 75., 141., 206., 135., 97., 138., 63., 110., 310., 101.,\n", + " 69., 179., 185., 118., 171., 166., 144., 97., 168., 68., 49.,\n", + " 68., 245., 184., 202., 137., 85., 131., 283., 129., 59., 341.,\n", + " 87., 65., 102., 265., 276., 252., 90., 100., 55., 61., 92.,\n", + " 259., 53., 190., 142., 75., 142., 155., 225., 59., 104., 182.,\n", + " 128., 52., 37., 170., 170., 61., 144., 52., 128., 71., 163.,\n", + " 150., 97., 160., 178., 48., 270., 202., 111., 85., 42., 170.,\n", + " 200., 252., 113., 143., 51., 52., 210., 65., 141., 55., 134.,\n", + " 42., 111., 98., 164., 48., 96., 90., 
162., 150., 279., 92.,\n", + " 83., 128., 102., 302., 198., 95., 53., 134., 144., 232., 81.,\n", + " 104., 59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,\n", + " 173., 180., 84., 121., 161., 99., 109., 115., 268., 274., 158.,\n", + " 107., 83., 103., 272., 85., 280., 336., 281., 118., 317., 235.,\n", + " 60., 174., 259., 178., 128., 96., 126., 288., 88., 292., 71.,\n", + " 197., 186., 25., 84., 96., 195., 53., 217., 172., 131., 214.,\n", + " 59., 70., 220., 268., 152., 47., 74., 295., 101., 151., 127.,\n", + " 237., 225., 81., 151., 107., 64., 138., 185., 265., 101., 137.,\n", + " 143., 141., 79., 292., 178., 91., 116., 86., 122., 72., 129.,\n", + " 142., 90., 158., 39., 196., 222., 277., 99., 196., 202., 155.,\n", + " 77., 191., 70., 73., 49., 65., 263., 248., 296., 214., 185.,\n", + " 78., 93., 252., 150., 77., 208., 77., 108., 160., 53., 220.,\n", + " 154., 259., 90., 246., 124., 67., 72., 257., 262., 275., 177.,\n", + " 71., 47., 187., 125., 78., 51., 258., 215., 303., 243., 91.,\n", + " 150., 310., 153., 346., 63., 89., 50., 39., 103., 308., 116.,\n", + " 145., 74., 45., 115., 264., 87., 202., 127., 182., 241., 66.,\n", + " 94., 283., 64., 102., 200., 265., 94., 230., 181., 156., 233.,\n", + " 60., 219., 80., 68., 332., 248., 84., 200., 55., 85., 89.,\n", + " 31., 129., 83., 275., 65., 198., 236., 253., 124., 44., 172.,\n", + " 114., 142., 109., 180., 144., 163., 147., 97., 220., 190., 109.,\n", + " 191., 122., 230., 242., 248., 249., 192., 131., 237., 78., 135.,\n", + " 244., 199., 270., 164., 72., 96., 306., 91., 214., 95., 216.,\n", + " 263., 178., 113., 200., 139., 139., 88., 148., 88., 243., 71.,\n", + " 77., 109., 272., 60., 54., 221., 90., 311., 281., 182., 321.,\n", + " 58., 262., 206., 233., 242., 123., 167., 63., 197., 71., 168.,\n", + " 140., 217., 121., 235., 245., 40., 52., 104., 132., 88., 69.,\n", + " 219., 72., 201., 110., 51., 277., 63., 118., 69., 273., 258.,\n", + " 43., 198., 242., 232., 175., 93., 168., 275., 293., 281., 
72.,\n", + " 140., 189., 181., 209., 136., 261., 113., 131., 174., 257., 55.,\n", + " 84., 42., 146., 212., 233., 91., 111., 152., 120., 67., 310.,\n", + " 94., 183., 66., 173., 72., 49., 64., 48., 178., 104., 132.,\n", + " 220., 57.]),\n", + " 'DESCR': '.. _diabetes_dataset:\\n\\nDiabetes dataset\\n----------------\\n\\nTen baseline variables, age, sex, body mass index, average blood\\npressure, and six blood serum measurements were obtained for each of n =\\n442 diabetes patients, as well as the response of interest, a\\nquantitative measure of disease progression one year after baseline.\\n\\n**Data Set Characteristics:**\\n\\n :Number of Instances: 442\\n\\n :Number of Attributes: First 10 columns are numeric predictive values\\n\\n :Target: Column 11 is a quantitative measure of disease progression one year after baseline\\n\\n :Attribute Information:\\n - Age\\n - Sex\\n - Body mass index\\n - Average blood pressure\\n - S1\\n - S2\\n - S3\\n - S4\\n - S5\\n - S6\\n\\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. 
the sum of squares of each column totals 1).\\n\\nSource URL:\\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\\n\\nFor more information see:\\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)',\n", + " 'feature_names': ['age',\n", + " 'sex',\n", + " 'bmi',\n", + " 'bp',\n", + " 's1',\n", + " 's2',\n", + " 's3',\n", + " 's4',\n", + " 's5',\n", + " 's6'],\n", + " 'data_filename': '/home/almsasantos/miniconda3/envs/data_env/lib/python3.7/site-packages/sklearn/datasets/data/diabetes_data.csv.gz',\n", + " 'target_filename': '/home/almsasantos/miniconda3/envs/data_env/lib/python3.7/site-packages/sklearn/datasets/data/diabetes_target.csv.gz'}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diabetes" ] }, { @@ -54,11 +142,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.keys()" ] }, { @@ -74,13 +174,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "scrolled": false }, - "outputs": [], - "source": [ - "# Your code here:\n" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_diabetes_dataset:\n", + "\n", + "Diabetes dataset\n", + "----------------\n", + "\n", + "Ten baseline variables, age, sex, body mass index, average blood\n", + "pressure, and six blood serum measurements were obtained for each of n =\n", + "442 diabetes patients, as well as the response of interest, a\n", + "quantitative measure of disease progression one year after baseline.\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 442\n", + "\n", + " :Number of Attributes: First 10 columns are numeric predictive values\n", + "\n", + " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n", + "\n", + " :Attribute Information:\n", + " - Age\n", + " - Sex\n", + " - Body mass index\n", + " - Average blood pressure\n", + " - S1\n", + " - S2\n", + " - S3\n", + " - S4\n", + " - S5\n", + " - S6\n", + "\n", + "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. 
the sum of squares of each column totals 1).\n", + "\n", + "Source URL:\n", + "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n", + "\n", + "For more information see:\n", + "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n", + "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes.DESCR)" ] }, { @@ -98,11 +244,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "# Enter your answer here:\n" + "# Enter your answer here:\n", + "#There are 10 attributes in the data: the age of the patient, their sex, body mass index, average blood pressure and six blood serum measurements (S1-S6).\n", + "#Based on the data we want to predict a quantitative measure of disease progression one year after baseline.\n", + "#There are 442 records in the data" ] }, { @@ -116,11 +265,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(442, 10)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(442,)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diabetes.target.shape" ] }, { @@ -157,11 +338,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.linear_model import LinearRegression\n", + "X = diabetes['data']\n", + "y = diabetes['target']" ] }, { @@ -173,11 +357,12 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model = LinearRegression()" ] }, { @@ -191,11 +376,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.model_selection import train_test_split\n", + "diabetes_data_train, diabetes_data_test, diabetes_target_train, diabetes_target_test = train_test_split(X, y, test_size=0.20)" ] }, { @@ -207,11 +394,65 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model.fit(diabetes_data_train, diabetes_target_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "152.44583955694154" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diabetes_model.intercept_" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ -89.45281236, -229.06384997, 518.83510703, 299.72996436,\n", + " -845.23992693, 549.2794504 , 120.58292975, 173.00861293,\n", + " 830.53440923, 44.73346357])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diabetes_model.coef_" ] }, { @@ -232,11 +473,46 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "array([138.89214536, 69.12661733, 121.94914756, 113.19129322,\n", + " 156.99751524, 160.26669269, 80.08848189, 61.30710601,\n", + " 84.73618699, 107.89958299, 151.35937186, 60.64696835,\n", + " 167.87414486, 79.27804032, 179.04698731, 233.58519952,\n", + " 112.96390722, 133.42174259, 257.65542174, 88.16482361,\n", + " 120.93394734, 103.8417481 , 103.57188763, 149.68285436,\n", + " 133.42796845, 124.7837086 , 140.57513167, 158.30971152,\n", + " 159.80344786, 77.79931922, 120.09548695, 168.00783093,\n", + " 123.37939875, 199.45131627, 90.26752494, 195.23115095,\n", + " 135.44962218, 224.249665 , 71.97282119, 164.8514765 ,\n", + " 265.94969922, 205.21673439, 141.45721116, 250.05254518,\n", + " 183.47955904, 212.41379545, 75.20164627, 87.75472013,\n", + " 222.42848036, 127.05644988, 87.78706082, 176.76840359,\n", + " 197.17051518, 126.35877742, 124.01929452, 97.6539756 ,\n", + " 75.06391055, 196.70493044, 91.77547822, 241.22160918,\n", + " 107.68270556, 132.26875559, 228.51409243, 192.45648231,\n", + " 141.97675603, 148.40867286, 176.19876293, 95.03557368,\n", + " 48.0875041 , 116.74774667, 123.65244898, 211.27805279,\n", + " 112.4169273 , 119.835268 , 66.47694782, 99.19859547,\n", + " 104.00187161, 50.95801923, 204.56926326, 120.06261764,\n", + " 249.14681134, 114.46870686, 108.36230907, 129.60594826,\n", + " 107.72571814, 126.7724346 , 152.55197824, 184.0222398 ,\n", + " 178.90568195])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_pred = diabetes_model.predict(diabetes_data_test)\n", + "y_pred" ] }, { @@ -248,11 +524,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 59. 39. 97. 90. 178. 276. 55. 70. 114. 63. 61. 77. 277. 77.\n", + " 85. 99. 127. 170. 341. 84. 59. 128. 199. 118. 150. 191. 182. 94.\n", + " 85. 51. 83. 180. 
160. 272. 71. 178. 230. 261. 128. 131. 308. 288.\n", + " 168. 245. 139. 163. 138. 96. 192. 144. 91. 217. 293. 49. 113. 49.\n", + " 65. 142. 51. 259. 71. 162. 246. 78. 83. 190. 174. 81. 116. 68.\n", + " 53. 221. 160. 96. 96. 101. 104. 63. 68. 64. 215. 200. 94. 40.\n", + " 182. 84. 155. 175. 283.]\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes_target_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "from sklearn.metrics import mean_squared_error" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3072.9763342489605" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y_true=diabetes_target_test, y_pred=y_pred)" ] }, { @@ -264,11 +584,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "# Your explanation here:\n" + "# Your explanation here:\n", + "#No" ] }, { @@ -303,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -327,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -352,11 +673,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto = pd.read_csv('../auto-mpg.csv')" ] }, { @@ -368,11 +690,124 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_yearcar_name
018.08307.0130.0350412.070\\t\"chevrolet chevelle malibu\"
115.08350.0165.0369311.570\\t\"buick skylark 320\"
218.08318.0150.0343611.070\\t\"plymouth satellite\"
316.08304.0150.0343312.070\\t\"amc rebel sst\"
417.08302.0140.0344910.570\\t\"ford torino\"
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight acceleration \\\n", + "0 18.0 8 307.0 130.0 3504 12.0 \n", + "1 15.0 8 350.0 165.0 3693 11.5 \n", + "2 18.0 8 318.0 150.0 3436 11.0 \n", + "3 16.0 8 304.0 150.0 3433 12.0 \n", + "4 17.0 8 302.0 140.0 3449 10.5 \n", + "\n", + " model_year car_name \n", + "0 70 \\t\"chevrolet chevelle malibu\" \n", + "1 70 \\t\"buick skylark 320\" \n", + "2 70 \\t\"plymouth satellite\" \n", + "3 70 \\t\"amc rebel sst\" \n", + "4 70 \\t\"ford torino\" " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.head()" ] }, { @@ -384,11 +819,31 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg float64\n", + "cylinders int64\n", + "displacement float64\n", + "horse_power float64\n", + "weight int64\n", + "acceleration float64\n", + "model_year int64\n", + "car_name object\n", + "dtype: object" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.dtypes" ] }, { @@ -400,11 +855,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The newest model year is 70 and the oldest model year is 82\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "newest_model = auto['model_year'].sort_values()[0]\n", + "oldest_model = auto['model_year'].sort_values(ascending=False)[381]\n", + "print(f'The newest model year is {newest_model} and the oldest model year is {oldest_model}')" ] }, { @@ -416,11 +882,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, + "metadata": {}, + "outputs": [ 
+ { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horse_power 6\n", + "weight 0\n", + "acceleration 0\n", + "model_year 0\n", + "car_name 0\n", + "dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "auto.dropna(axis=0, inplace=True)" ] }, { @@ -432,11 +927,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 5 possible values of cylinders\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto['cylinders'].value_counts()\n", + "print(f'There are {len(auto.cylinders.unique())} possible values of cylinders')" ] }, { @@ -452,11 +957,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.drop('car_name', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "X = auto.drop('mpg', axis=1)\n", + "y = auto['mpg'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" ] }, { @@ -470,11 +995,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + 
"# Your code here:\n", + "auto_model = LinearRegression()\n", + "auto_model.fit(X_train, y_train)" ] }, { @@ -503,11 +1041,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_pred = auto_model.predict(X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import r2_score" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8013399085677433" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2_score(y_train, y_pred)" ] }, { @@ -523,11 +1091,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8272654354467638" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_test_pred = auto_model.predict(X_test)\n", + "r2_score(y_test, y_test_pred)" ] }, { @@ -552,7 +1133,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -568,7 +1149,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -584,7 +1165,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -600,7 +1181,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -620,7 +1201,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -636,7 +1217,7 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -652,7 +1233,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -670,7 +1251,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -686,7 +1267,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -713,9 +1294,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:data_env]", "language": "python", - "name": "python3" + "name": "conda-env-data_env-py" }, "language_info": { "codemirror_mode": { From 7cd83d1c1761dec35cbb1f425d808d99a0edd5b5 Mon Sep 17 00:00:00 2001 From: almsasantos Date: Mon, 16 Mar 2020 12:28:30 +0100 Subject: [PATCH 4/4] everything done, except bonus 1 --- .../your-code/main.ipynb | 307 +++++++++++++++--- 1 file changed, 253 insertions(+), 54 deletions(-) diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 99cfc233..9b815120 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -421,7 +421,7 @@ { "data": { "text/plain": [ - "152.44583955694154" + "151.62325584057453" ] }, "execution_count": 13, @@ -441,9 +441,9 @@ { "data": { "text/plain": [ - "array([ -89.45281236, -229.06384997, 518.83510703, 299.72996436,\n", - " -845.23992693, 549.2794504 , 120.58292975, 173.00861293,\n", - " 830.53440923, 44.73346357])" + "array([ -61.60664965, -260.44060769, 544.72083274, 280.13157371,\n", + " -607.34982048, 288.11490872, 14.1194967 , 166.80860998,\n", + " 707.22691646, 70.97826776])" ] }, "execution_count": 14, @@ -479,29 +479,29 @@ { "data": { "text/plain": [ - "array([138.89214536, 69.12661733, 
121.94914756, 113.19129322,\n", - " 156.99751524, 160.26669269, 80.08848189, 61.30710601,\n", - " 84.73618699, 107.89958299, 151.35937186, 60.64696835,\n", - " 167.87414486, 79.27804032, 179.04698731, 233.58519952,\n", - " 112.96390722, 133.42174259, 257.65542174, 88.16482361,\n", - " 120.93394734, 103.8417481 , 103.57188763, 149.68285436,\n", - " 133.42796845, 124.7837086 , 140.57513167, 158.30971152,\n", - " 159.80344786, 77.79931922, 120.09548695, 168.00783093,\n", - " 123.37939875, 199.45131627, 90.26752494, 195.23115095,\n", - " 135.44962218, 224.249665 , 71.97282119, 164.8514765 ,\n", - " 265.94969922, 205.21673439, 141.45721116, 250.05254518,\n", - " 183.47955904, 212.41379545, 75.20164627, 87.75472013,\n", - " 222.42848036, 127.05644988, 87.78706082, 176.76840359,\n", - " 197.17051518, 126.35877742, 124.01929452, 97.6539756 ,\n", - " 75.06391055, 196.70493044, 91.77547822, 241.22160918,\n", - " 107.68270556, 132.26875559, 228.51409243, 192.45648231,\n", - " 141.97675603, 148.40867286, 176.19876293, 95.03557368,\n", - " 48.0875041 , 116.74774667, 123.65244898, 211.27805279,\n", - " 112.4169273 , 119.835268 , 66.47694782, 99.19859547,\n", - " 104.00187161, 50.95801923, 204.56926326, 120.06261764,\n", - " 249.14681134, 114.46870686, 108.36230907, 129.60594826,\n", - " 107.72571814, 126.7724346 , 152.55197824, 184.0222398 ,\n", - " 178.90568195])" + "array([ 66.37935132, 88.84819977, 85.93016773, 137.93624351,\n", + " 249.69856319, 117.97000149, 166.41024337, 231.14148398,\n", + " 156.65895101, 81.90045079, 157.19526302, 169.45360892,\n", + " 214.00558451, 296.761621 , 195.85525992, 135.67550785,\n", + " 112.28077343, 124.43548197, 94.01643575, 118.38802303,\n", + " 174.33980063, 177.12239007, 212.09002138, 114.41655502,\n", + " 162.67844191, 184.5950349 , 215.83846784, 96.60362119,\n", + " 144.99444982, 225.88063695, 159.49664665, 171.13016093,\n", + " 215.06142677, 110.25870676, 164.84506098, 65.54869156,\n", + " 191.56656633, 94.08292533, 162.08119093, 
125.11978521,\n", + " 91.01553723, 107.16473123, 170.56909167, 199.19054466,\n", + " 146.64258834, 254.62508693, 235.26123914, 73.48024846,\n", + " 97.24764758, 122.52878271, 252.34718987, 110.97011029,\n", + " 232.78000165, 144.44212693, 230.84327562, 65.29700824,\n", + " 83.67883215, 105.44311055, 129.151523 , 281.66828128,\n", + " 123.54115313, 61.87309155, 204.42153502, 196.03498841,\n", + " 229.32866686, 178.88730863, 191.9882965 , 129.97907864,\n", + " 170.26795101, 180.77290578, 183.27743198, 178.26576001,\n", + " 62.61083204, 156.70456891, 57.53442299, 95.60119443,\n", + " 127.14478011, 132.35303867, 96.57270416, 188.04159405,\n", + " 149.048561 , 98.43510818, 93.08321169, 79.50374091,\n", + " 219.87684007, 225.05178691, 221.80598641, 156.20247408,\n", + " 104.23131512])" ] }, "execution_count": 15, @@ -531,13 +531,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "[ 59. 39. 97. 90. 178. 276. 55. 70. 114. 63. 61. 77. 277. 77.\n", - " 85. 99. 127. 170. 341. 84. 59. 128. 199. 118. 150. 191. 182. 94.\n", - " 85. 51. 83. 180. 160. 272. 71. 178. 230. 261. 128. 131. 308. 288.\n", - " 168. 245. 139. 163. 138. 96. 192. 144. 91. 217. 293. 49. 113. 49.\n", - " 65. 142. 51. 259. 71. 162. 246. 78. 83. 190. 174. 81. 116. 68.\n", - " 53. 221. 160. 96. 96. 101. 104. 63. 68. 64. 215. 200. 94. 40.\n", - " 182. 84. 155. 175. 283.]\n" + "[ 43. 158. 181. 140. 243. 64. 77. 246. 154. 72. 196. 180. 52. 258.\n", + " 78. 142. 97. 68. 84. 60. 144. 147. 166. 53. 138. 257. 220. 118.\n", + " 197. 268. 127. 171. 180. 90. 216. 96. 191. 74. 245. 214. 71. 129.\n", + " 184. 293. 103. 281. 317. 89. 170. 145. 336. 253. 321. 93. 281. 75.\n", + " 42. 69. 49. 230. 44. 52. 151. 186. 152. 263. 161. 178. 217. 283.\n", + " 232. 107. 70. 94. 39. 108. 170. 230. 81. 202. 134. 101. 69. 65.\n", + " 173. 225. 295. 178. 
125.]\n" ] } ], @@ -563,7 +563,7 @@ { "data": { "text/plain": [ - "3072.9763342489605" + "2976.748095762798" ] }, "execution_count": 18, @@ -1066,7 +1066,7 @@ { "data": { "text/plain": [ - "0.8013399085677433" + "0.8236124478526281" ] }, "execution_count": 35, @@ -1097,7 +1097,7 @@ { "data": { "text/plain": [ - "0.8272654354467638" + "0.7436589879920421" ] }, "execution_count": 36, @@ -1137,7 +1137,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "X_train09, X_test09, y_train09, y_test09 = train_test_split(X, y, test_size=0.1)" ] }, { @@ -1153,7 +1154,28 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model09 = LinearRegression()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto_model09.fit(X_train09, y_train09)" ] }, { @@ -1165,11 +1187,53 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_pred_train09 = auto_model09.predict(X_train09)\n", + "y_pred_test09 = auto_model09.predict(X_test09)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8176093617328014" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2_score(y_train09, y_pred_train09)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7260025114610873" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2_score(y_test09, 
y_pred_test09)" ] }, { @@ -1181,11 +1245,20 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 43, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The r squared score for the new training data is 0.8176093617328014 and the r squared score for the new testing data is 0.7260025114610873\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(f'The r squared score for the new training data is {r2_score(y_train09, y_pred_train09)} and the r squared score for the new testing data is {r2_score(y_test09, y_pred_test09)}')" ] }, { @@ -1201,7 +1274,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -1217,11 +1290,13 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model = LinearRegression()\n", + "selector = RFE(estimator = auto_model, n_features_to_select=3)" ] }, { @@ -1233,11 +1308,45 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 46, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n", + " normalize=False),\n", + " n_features_to_select=3, step=1, verbose=0)" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "selector.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 2, 4, 3, 1, 1])" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selector.ranking_" ] }, { @@ -1251,11 +1360,21 @@ }, { "cell_type": "code", - "execution_count": 44, + 
"execution_count": 48, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "X = auto.drop(['mpg', 'displacement', 'horse_power', 'weight'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(X, y, train_size=0.8)" ] }, { @@ -1267,11 +1386,91 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here: \n", + "auto_model_reduced = LinearRegression()\n", + "auto_model_reduced.fit(X_train_reduced, y_train_reduced)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ - "# Your code here: \n" + "y_pred_train_reduced = auto_model_reduced.predict(X_train_reduced)\n", + "y_pred_test_reduced = auto_model_reduced.predict(X_test_reduced)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7190692512547585" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2_score(y_train_reduced, y_pred_train_reduced)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6945400439388549" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2_score(y_test_reduced, y_pred_test_reduced)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The r squared score for the new 
training data is 0.7190692512547585 and the r squared score for the new testing data is 0.6945400439388549\n" + ] + } + ], + "source": [ + "print(f'The r squared score for the new training data is {r2_score(y_train_reduced, y_pred_train_reduced)} and the r squared score for the new testing data is {r2_score(y_test_reduced, y_pred_test_reduced)}')" ] }, {