From fa5c8b8b956155a3c3dae345de48d85163b9cfa5 Mon Sep 17 00:00:00 2001
From: "naveen.atukorala@pyxeda.ai" <naveen.atukorala@pyxeda.ai>
Date: Mon, 8 Nov 2021 09:34:53 +0530
Subject: [PATCH] random forest for human activity prediction

---
 Chapter14/activityPrediction/README.md        |  15 ++
 .../random_forest_activity_predictor.ipynb    | 190 ++++++++++++++++++
 .../random_forest_activity_predictor.py       | 103 +++++++++-
 3 files changed, 305 insertions(+), 3 deletions(-)
 create mode 100644 Chapter14/activityPrediction/random_forest_activity_predictor.ipynb

diff --git a/Chapter14/activityPrediction/README.md b/Chapter14/activityPrediction/README.md
index e69de29..702d9ca 100644
--- a/Chapter14/activityPrediction/README.md
+++ b/Chapter14/activityPrediction/README.md
@@ -0,0 +1,15 @@
+What it does : 
+
+    1. This Python program implements a random forest model for human activity prediction using the UCI HAR Dataset.
+
+Dependencies :
+
+    1. The pandas, scikit-learn (sklearn) and gdown modules must be installed on the local machine to run this program. 
+
+Things to check before running :
+
+    1. Check whether you have given the correct location of your dataset file.
+    2. You should have access to the file in the Google Drive.
+
+
+ 
\ No newline at end of file
diff --git a/Chapter14/activityPrediction/random_forest_activity_predictor.ipynb b/Chapter14/activityPrediction/random_forest_activity_predictor.ipynb
new file mode 100644
index 0000000..6a4a303
--- /dev/null
+++ b/Chapter14/activityPrediction/random_forest_activity_predictor.ipynb
@@ -0,0 +1,190 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "random_forest_activity_predictor.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "8xLWQeV4UPh2"
+      },
+      "source": [
+        "# **Problem: Random forest for human activity prediction.**\n",
+        "\n",
+        "Random forest for human activity prediction using UCI HAR Dataset.\n",
+        "\n",
+        "**Examples:**\n",
+        "\n",
+        "Change the variable 'url' by providing the google drive URL of the zip file, that you want to download.\n",
+        "\n",
+        "Eg:url = 'https://drive.google.com/file/d/1K7izykrla-qEuekekLayfGddml17calY/view?usp=sharing'\n",
+        "\n",
+        "Run all the cells. After executing the last cell, you will see the accuracy score of the model.\n",
+        "\n",
+        "**Notes:**\n",
+        "\n",
+        "Following things are needed to be checked before running the program.\n",
+        " 1. Sklearn module is needed to run this program in a notebook. \n",
+        " 2. Check whether you have given the correct location of your zip file.\n",
+        " 3. You should have access to the file in the Google Drive.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "V_xkWyuYVRl_"
+      },
+      "source": [
+        "# **Import Modules**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "2iGN-VmnVVxC"
+      },
+      "source": [
+        "# Import pandas\n",
+        "import pandas as pd\n",
+        "\n",
+        "# Import RandomForestClassifier since we are using random forest for this problem\n",
+        "from sklearn.ensemble import RandomForestClassifier\n",
+        "\n",
+        "# Import GridSearchCV\n",
+        "from sklearn.model_selection import GridSearchCV\n",
+        "\n",
+        "# Import accuracy_score to calculate accuracy\n",
+        "from sklearn.metrics import accuracy_score\n",
+        "\n",
+        "# Import gdown module to download files from google drive\n",
+        "import gdown\n",
+        "\n",
+        "# Import zip file module to open the zip file\n",
+        "from zipfile import ZipFile"
+      ],
+      "execution_count": 1,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "mPLYAU7ZZIgv"
+      },
+      "source": [
+        "# **Get the file location from google drive**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "zzNzKjELZOUV"
+      },
+      "source": [
+        "# Please change the URL as needed (make sure you have the access to the file)\n",
+        "\n",
+        "url = 'https://drive.google.com/file/d/1z_zn7vv-Sk60fdoQuPN3h9wkyP8H-FQR/view?usp=sharing'\n",
+        "\n",
+        "# Derive the file id from the URL\n",
+        "file_id = url.split('/')[-2]\n",
+        "\n",
+        "# Derive the download url of the file\n",
+        "download_url = 'https://drive.google.com/uc?id=' + file_id\n",
+        "\n",
+        "# Give the location you want to save\n",
+        "file_location = 'UCI_HAR_Dataset.zip'\n",
+        "\n",
+        "# Download the file from drive\n",
+        "gdown.download(download_url, file_location, quiet=False)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "9OKD1-GlaDDv"
+      },
+      "source": [
+        "# **Unzip the zip dataset**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "nY7UjA3UaFFN"
+      },
+      "source": [
+        "!unzip /content/UCI_HAR_Dataset.zip -d \"/content/unzipped_folder/\""
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "m25lZG87ZjJd"
+      },
+      "source": [
+        "# **Begin Activity prediction operation**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ShhkO__YZkrl"
+      },
+      "source": [
+        "# Read train and test file using pandas\n",
+        "xtrain=pd.read_table(r'/content/unzipped_folder/UCI HAR Dataset/train/X_train.txt',delim_whitespace=True,header=None)\n",
+        "\n",
+        "\n",
+        "xtest=pd.read_table(r'/content/unzipped_folder/UCI HAR Dataset/test/X_test.txt',delim_whitespace=True,header=None)\n",
+        "\n",
+        "\n",
+        "ytrain=pd.read_table(r'/content/unzipped_folder/UCI HAR Dataset/train/y_train.txt',header=None)\n",
+        "\n",
+        "\n",
+        "ytest=pd.read_table(r'/content/unzipped_folder/UCI HAR Dataset/test/y_test.txt',header=None)\n",
+        "\n",
+        "# Return first 5 rows of the xtrain dataframe\n",
+        "xtrain.head()\n",
+        "\n",
+        "# Initialize randomforest classifier\n",
+        "classifier = RandomForestClassifier()\n",
+        "\n",
+        "# Define parameters for GridSearchCV method below\n",
+        "parameters = {'n_estimators': [10, 100, 1000], 'max_depth': [3, 6, 9], 'max_features' : ['auto', 'log2']}\n",
+        "\n",
+        "# Derive the model\n",
+        "model=GridSearchCV(classifier,parameters,n_jobs=-1,cv=4,scoring='accuracy',verbose=4)\n",
+        "\n",
+        "# Fit training data\n",
+        "model.fit(xtrain.to_numpy(),ytrain.to_numpy().ravel().T)\n",
+        "\n",
+        "# Get the predictions\n",
+        "ypred=model.predict(xtest)\n",
+        "\n",
+        "# Calculate the accuracy of the model\n",
+        "accuracy=accuracy_score(ytest,ypred)\n",
+        "\n",
+        "# Print accuracy score\n",
+        "print('Accuracy Score: '+ str(accuracy*100) + ' %')"
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/Chapter14/activityPrediction/random_forest_activity_predictor.py b/Chapter14/activityPrediction/random_forest_activity_predictor.py
index daf8121..d6b3e86 100644
--- a/Chapter14/activityPrediction/random_forest_activity_predictor.py
+++ b/Chapter14/activityPrediction/random_forest_activity_predictor.py
@@ -1,3 +1,100 @@
-# TODO: Random Forest for activity prediction on this dataset: 
-#       https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
-# TODO: Code should be well commented.
\ No newline at end of file
+'''Copyright (c) 2021 AIClub
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
+documentation files (the "Software"), to deal in the Software without restriction, including without 
+limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 
+the Software, and to permit persons to whom the Software is furnished to do so, subject to the following 
+conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 
+LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO 
+EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+OR OTHER DEALINGS IN THE SOFTWARE.'''
+
+# Random forest for human activity prediction using UCI HAR Dataset.
+
+# Import pandas
+import pandas as pd
+
+# Import RandomForestClassifier since we are using random forest for this problem
+from sklearn.ensemble import RandomForestClassifier
+
+# Import GridSearchCV
+from sklearn.model_selection import GridSearchCV
+
+# Import accuracy_score to calculate accuracy
+from sklearn.metrics import accuracy_score
+
+# Import gdown module to download files from google drive
+import gdown
+
+# Import zip file module to open the zip file
+from zipfile import ZipFile
+
+#--------------------------------------------- Get the file location from google drive  ----------------------------------------
+
+# Please change the URL as needed (make sure you have the access to the file)
+
+url = 'https://drive.google.com/file/d/1z_zn7vv-Sk60fdoQuPN3h9wkyP8H-FQR/view?usp=sharing'
+
+# Derive the file id from the URL
+file_id = url.split('/')[-2]
+
+# Derive the download url of the the file
+download_url = 'https://drive.google.com/uc?id=' + file_id
+
+# Give the location you want to save it in your local machine
+file_location = 'UCI_HAR_Dataset.zip'
+
+#--------------------------------------------- Download and extract the zip file -------------------------------------------------
+
+# Download the file from drive to your local machine
+gdown.download(download_url, file_location)
+
+# Open the downloaded zip file and extract its contents
+with ZipFile(file_location, "r") as zip_file:
+    zip_file.extractall()
+
+#--------------------------------------------- Begin Activity prediction operation -----------------------------------------------
+
+# Read train and test file using pandas
+xtrain=pd.read_table(r'UCI HAR Dataset\train\X_train.txt',delim_whitespace=True,header=None)
+
+
+xtest=pd.read_table(r'UCI HAR Dataset\test\X_test.txt',delim_whitespace=True,header=None)
+
+
+ytrain=pd.read_table(r'UCI HAR Dataset\train\y_train.txt',header=None)
+
+
+ytest=pd.read_table(r'UCI HAR Dataset\test\y_test.txt',header=None)
+
+# Return first 5 raws of the xtrain dataframe
+xtrain.head()
+
+# Initialize randomforest classifier
+classifier = RandomForestClassifier()
+
+# Define parameters for GridSearchCV method below
+parameters = {'n_estimators': [10, 100, 1000], 'max_depth': [3, 6, 9], 'max_features' : ['auto', 'log2']}
+
+# Derive the model
+model=GridSearchCV(classifier,parameters,n_jobs=-1,cv=4,scoring='accuracy',verbose=4)
+
+# Fit training data
+model.fit(xtrain.to_numpy(),ytrain.to_numpy().ravel().T)
+
+# Get the predictions
+ypred=model.predict(xtest)
+
+# Calculate the accuracy of the model
+accuracy=accuracy_score(ytest,ypred)
+
+# Print accuracy score
+print('Accuracy Score: '+ str(accuracy*100) + ' %')
+
+