From e93192f744ef22f742b2fe1d8635b3a460bf8807 Mon Sep 17 00:00:00 2001 From: Shailesh Thapa Date: Sun, 18 May 2025 22:42:33 -0400 Subject: [PATCH 1/3] assignment 1 complete --- 02_activities/assignments/assignment_1.ipynb | 66 +++++++++++++++++--- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 828092657..fef43dcf6 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -96,7 +96,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# the number of observations (178)\n", + "number_of_rows = wine_df.shape[0]\n", + "f\"Number of observations (rows): {number_of_rows}\"" ] }, { @@ -114,7 +116,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# to get number of variables (14)\n", + "number_of_columns = wine_df.shape[1]\n", + "f\"Number of variables (columns): {number_of_columns}\"" ] }, { @@ -132,7 +136,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# Variable type\n", + "wine_df['class'].dtype\n", + "\n", + "# unique_values\n", + "wine_df['class'].unique()" ] }, { @@ -151,7 +159,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# number of predictor variables (13)\n", + "number_of_predictors = wine_df.shape[1] - 1\n", + "f\"Number of predictor variables: {number_of_predictors}\"" ] }, { @@ -204,7 +214,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "> Standardization is essential since it ensures that no single feature has an excessive impact on the distance metric, improving model accuracy and facilitating more equitable comparisons." ] }, { @@ -220,7 +230,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "> Since the class variable is a categorical target rather than a numerical attribute, we do not normalize it. 
KNN uses the class only to assign labels, not to compute distances. " ] }, { @@ -236,7 +246,7 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "> The random seed ensures that any calculation involving randomness produces the same result every time we run the code. The exact seed value is irrelevant, but it must be fixed if we want consistent results between runs." ] }, { @@ -261,7 +271,22 @@ "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", "\n", - "# Your code here ..." + "# predictors (X) and response variable (y)\n", + "X = predictors_standardized\n", + "y = wine_df['class']\n", + "\n", + "# 75/25 train-test split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, \n", + " test_size=0.25, \n", + " random_state=123 \n", + ")\n", + "\n", + "# non-overlapping splits\n", + "print(f\"X_train shape: {X_train.shape}\")\n", + "print(f\"X_test shape: {X_test.shape}\")\n", + "print(f\"y_train shape: {y_train.shape}\")\n", + "print(f\"y_test shape: {y_test.shape}\")" ] }, { @@ -289,7 +314,19 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here..." + "# Best Value (15)\n", + "knn = KNeighborsClassifier()\n", + "param_grid = {'n_neighbors': list(range(1, 51))}\n", + "\n", + "grid_search = GridSearchCV(estimator=knn,\n", + " param_grid=param_grid,\n", + " cv=10, \n", + " scoring='accuracy', \n", + " n_jobs=-1) \n", + "\n", + "grid_search.fit(X_train, y_train)\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "f\"Best number of neighbors (n_neighbors): {best_k}\"\n" ] }, { @@ -310,7 +347,16 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here..." 
+ "# Using accuracy_score\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "best_knn = KNeighborsClassifier(n_neighbors=best_k)\n", + "best_knn.fit(X_train, y_train)\n", + "\n", + "y_pred = best_knn.predict(X_test)\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "f\"Test set accuracy with k = {best_k}: {accuracy:.4f}\"" ] }, { From a6a36e5e710e476b72605058bd18969e13e7834d Mon Sep 17 00:00:00 2001 From: Shailesh Thapa Date: Tue, 20 May 2025 15:58:19 -0400 Subject: [PATCH 2/3] updated --- 02_activities/assignments/assignment_1.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index fef43dcf6..6c64b9064 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -279,7 +279,8 @@ "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, \n", " test_size=0.25, \n", - " random_state=123 \n", + " random_state=123,\n", + " stratify=y \n", ")\n", "\n", "# non-overlapping splits\n", From aece5a487e4a38f447c7f4f2284d9311ae68edf3 Mon Sep 17 00:00:00 2001 From: Shailesh Thapa Date: Wed, 21 May 2025 10:57:54 -0400 Subject: [PATCH 3/3] New updated. 
--- 02_activities/assignments/assignment_1.ipynb | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 6c64b9064..7f598e8a8 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -97,8 +97,7 @@ "outputs": [], "source": [ "# the number of observations (178)\n", - "number_of_rows = wine_df.shape[0]\n", - "f\"Number of observations (rows): {number_of_rows}\"" + "178" ] }, { @@ -117,8 +116,7 @@ "outputs": [], "source": [ "# to get number of variables (14)\n", - "number_of_columns = wine_df.shape[1]\n", - "f\"Number of variables (columns): {number_of_columns}\"" + "14" ] }, { @@ -137,10 +135,10 @@ "outputs": [], "source": [ "# Variable type\n", - "wine_df['class'].dtype\n", + "integer\n", "\n", "# unique_values\n", - "wine_df['class'].unique()" + "[0,1,2]" ] }, { @@ -160,8 +158,7 @@ "outputs": [], "source": [ "# number of predictor variables (13)\n", - "number_of_predictors = wine_df.shape[1] - 1\n", - "f\"Number of predictor variables: {number_of_predictors}\"" + "13" ] }, {