diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 828092657..7f598e8a8 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -96,7 +96,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# the number of observations (178)\n", + "178" ] }, { @@ -114,7 +115,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# to get number of variables (14)\n", + "14" ] }, { @@ -132,7 +134,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# Variable type\n", + "# integer\n", + "\n", + "# unique_values\n", + "[0,1,2]" ] }, { @@ -151,7 +157,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# number of predictor variables (13)\n", + "13" ] }, { @@ -204,7 +211,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "> Standardization is essential since it ensures that no one feature has an excessive impact on the distance metric, improving model accuracy and facilitating more equitable comparisons." ] }, { @@ -220,7 +227,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "> Since the class variable is a categorical target rather than a numerical attribute, we do not standardize it. KNN uses the class only to assign labels, not to compute distances. " ] }, { @@ -236,7 +243,7 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "> The random seed ensures that any calculation involving randomness produces the same result every time we run the code. The exact seed value is irrelevant, but it must be fixed if we want consistent results between runs." ] }, { @@ -261,7 +268,23 @@ "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", "\n", - "# Your code here ..." 
+ "# predictors (X) and response variable (y)\n", + "X = predictors_standardized\n", + "y = wine_df['class']\n", + "\n", + "# 75/25 train-test split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, \n", + " test_size=0.25, \n", + " random_state=123,\n", + " stratify=y \n", + ")\n", + "\n", + "# non-overlapping splits\n", + "print(f\"X_train shape: {X_train.shape}\")\n", + "print(f\"X_test shape: {X_test.shape}\")\n", + "print(f\"y_train shape: {y_train.shape}\")\n", + "print(f\"y_test shape: {y_test.shape}\")" ] }, { @@ -289,7 +312,19 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here..." + "# Best Value (15)\n", + "knn = KNeighborsClassifier()\n", + "param_grid = {'n_neighbors': list(range(1, 51))}\n", + "\n", + "grid_search = GridSearchCV(estimator=knn,\n", + " param_grid=param_grid,\n", + " cv=10, \n", + " scoring='accuracy', \n", + " n_jobs=-1) \n", + "\n", + "grid_search.fit(X_train, y_train)\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "f\"Best number of neighbors (n_neighbors): {best_k}\"\n" ] }, { @@ -310,7 +345,16 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here..." + "# Using accuracy_score\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "best_knn = KNeighborsClassifier(n_neighbors=best_k)\n", + "best_knn.fit(X_train, y_train)\n", + "\n", + "y_pred = best_knn.predict(X_test)\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "f\"Test set accuracy with k = {best_k}: {accuracy:.4f}\"" ] }, {