Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 54 additions & 10 deletions 02_activities/assignments/assignment_1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Your answer here"
"# the number of observations (178)\n",
"178"
]
},
{
Expand All @@ -114,7 +115,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Your answer here"
"# to get number of variables (14)\n",
"14"
]
},
{
Expand All @@ -132,7 +134,11 @@
"metadata": {},
"outputs": [],
"source": [
"# Your answer here"
"# Variable type\n",
"integer\n",
"\n",
"# unique_values\n",
"[0,1,2]"
]
},
{
Expand All @@ -151,7 +157,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Your answer here"
"# number of predictor variables (13)\n",
"13"
]
},
{
Expand Down Expand Up @@ -204,7 +211,7 @@
"id": "403ef0bb",
"metadata": {},
"source": [
"> Your answer here..."
 Standardization">
"> Standardization is essential since it ensures that no one feature has an excessive impact on the distance metric, improving model accuracy and facilitating more equitable comparisons."
]
},
{
Expand All @@ -220,7 +227,7 @@
"id": "fdee5a15",
"metadata": {},
"source": [
"> Your answer here..."
 since the class">
"> Since the class variable is a categorical target rather than a numerical attribute, we do not normalize it. KNN uses class only to assign labels, not to compute distances. "
]
},
{
Expand All @@ -236,7 +243,7 @@
"id": "f0676c21",
"metadata": {},
"source": [
"> Your answer here..."
 the random seed">
"> The random seed ensures that any calculation involving randomness produces the same result every time we run the code. The exact seed value is irrelevant, but it must be fixed if we want consistent results between runs."
]
},
{
Expand All @@ -261,7 +268,23 @@
"\n",
"# split the data into a training and testing set. hint: use train_test_split !\n",
"\n",
"# Your code here ..."
"# predictors (X) and response variable (y)\n",
"X = predictors_standardized\n",
"y = wine_df['class']\n",
"\n",
"# 75/25 train-test split\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, \n",
" test_size=0.25, \n",
" random_state=123,\n",
" stratify=y \n",
")\n",
"\n",
"# non-overlapping splits\n",
"print(f\"X_train shape: {X_train.shape}\")\n",
"print(f\"X_test shape: {X_test.shape}\")\n",
"print(f\"y_train shape: {y_train.shape}\")\n",
"print(f\"y_test shape: {y_test.shape}\")"
]
},
{
Expand Down Expand Up @@ -289,7 +312,19 @@
"metadata": {},
"outputs": [],
"source": [
"# Your code here..."
"# Best Value (15)\n",
"knn = KNeighborsClassifier()\n",
"param_grid = {'n_neighbors': list(range(1, 51))}\n",
"\n",
"grid_search = GridSearchCV(estimator=knn,\n",
" param_grid=param_grid,\n",
" cv=10, \n",
" scoring='accuracy', \n",
" n_jobs=-1) \n",
"\n",
"grid_search.fit(X_train, y_train)\n",
"best_k = grid_search.best_params_['n_neighbors']\n",
"f\"Best number of neighbors (n_neighbors): {best_k}\"\n"
]
},
{
Expand All @@ -310,7 +345,16 @@
"metadata": {},
"outputs": [],
"source": [
"# Your code here..."
"# Using accuracy_score\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"best_k = grid_search.best_params_['n_neighbors']\n",
"best_knn = KNeighborsClassifier(n_neighbors=best_k)\n",
"best_knn.fit(X_train, y_train)\n",
"\n",
"y_pred = best_knn.predict(X_test)\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"f\"Test set accuracy with k = {best_k}: {accuracy:.4f}\""
]
},
{
Expand Down