From dd3d04f88a4793b804985f052f24dca0c218f0d2 Mon Sep 17 00:00:00 2001 From: ChathukiKet Date: Wed, 20 Oct 2021 12:28:59 +0530 Subject: [PATCH 1/2] chapter 10 - logistic regression model for child vs adult dataset --- Chapter10/LogisticRegression/README.md | 23 +- .../logistic_regression.ipynb | 287 ++++++++++++++++++ .../LogisticRegression/logistic_regression.py | 103 ++++++- 3 files changed, 409 insertions(+), 4 deletions(-) create mode 100644 Chapter10/LogisticRegression/logistic_regression.ipynb diff --git a/Chapter10/LogisticRegression/README.md b/Chapter10/LogisticRegression/README.md index 5c4a0ff..d24ae89 100644 --- a/Chapter10/LogisticRegression/README.md +++ b/Chapter10/LogisticRegression/README.md @@ -1 +1,22 @@ -This readme should describe 'what is in the python code'. \ No newline at end of file +What it does : + + 1. Reads a CSV file from Google Drive, then creates a logistic regression model on the child vs adult dataset and calculates the accuracy and confusion matrix. + +Dependencies : + + 1. The sklearn module must be installed on the local machine. + + 2. The pandas module must be installed on the local machine, to read the CSV file. + + 3. The gdown module must be installed on the local machine, to download the CSV file from Google Drive. + + +Things to check before running : + + 1. Check whether you have access to the internet. + + 2. Check whether you have given the correct file location of the CSV file. + + 3. Check whether you have access to the file. + + 4. Check whether the file format is correct. 
\ No newline at end of file diff --git a/Chapter10/LogisticRegression/logistic_regression.ipynb b/Chapter10/LogisticRegression/logistic_regression.ipynb new file mode 100644 index 0000000..2564a8b --- /dev/null +++ b/Chapter10/LogisticRegression/logistic_regression.ipynb @@ -0,0 +1,287 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "logistic_regressor.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "1xWA9Xxd-BT2" + }, + "source": [ + "# **Problem: Logistic Regression Model**\n", + "\n", + "The python program reads a csv file (child vs adult dataset) from the google drive, and creates a logistic regression model on that dataset. It then calculates the confusion matrix and accuracy. \n", + "\n", + "**Examples:**\n", + "\n", + "Input -->\n", + "\n", + "> url = 'https://drive.google.com/file/d/1J5z8OsAtgSp9i1eLxQFoxVexSZJuhI_-/view?usp=sharing'\n", + "\n", + "Output -->\n", + "\n", + "> Confusion matrix = [[488 12]\n", + " [163 337]]\n", + "\n", + "> Accuracy = 0.825\n", + "\n", + "**Notes:**\n", + "\n", + "The following things need to be checked before running the program.\n", + "* Check whether you have given the correct location of your csv file.\n", + "* Check whether your file format is correct.\n", + "* Check whether you have access to the file." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5zlOyhAE7e5k" + }, + "source": [ + "#@title MIT License\n", + "\n", + "# Copyright (c) 2021 AIClub\n", + "\n", + "# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated \n", + "# documentation files (the \"Software\"), to deal in the Software without restriction, including without \n", + "# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of \n", + "# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following \n", + "# conditions:\n", + "\n", + "# The above copyright notice and this permission notice shall be included in all copies or substantial\n", + "# portions of the Software.\n", + "\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT \n", + "# LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO \n", + "# EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN \n", + "# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE \n", + "# OR OTHER DEALINGS IN THE SOFTWARE." 
+ ], + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VBqjhcUbc-72" + }, + "source": [ + "# Import modules" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9S-OD3v0cTxA" + }, + "source": [ + "# Import pandas module to read the CSV file and to process the tabular data\n", + "import pandas as pd\n", + "\n", + "# Import train_test_split to split data as test and train\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Import LogisticRegression class\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "# Import accuracy_score and confusion_matrix functions for accuracy calculations\n", + "from sklearn.metrics import accuracy_score, confusion_matrix\n", + "\n", + "# Import gdown module to download files from google drive\n", + "import gdown" + ], + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NC5yAkbHdCZm" + }, + "source": [ + "# Get the file location from the google drive" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "hR2Ar8XHdoav" + }, + "source": [ + "# Please change the url as needed (make sure you have the access to the file)\n", + "url = \"https://drive.google.com/file/d/1J5z8OsAtgSp9i1eLxQFoxVexSZJuhI_-/view?usp=sharing\" \n", + "\n", + "# Derive the file id from the url\n", + "file_id = url.split('/')[-2]\n", + "\n", + "# Derive the download url of the file\n", + "download_url = 'https://drive.google.com/uc?id=' + file_id\n", + "\n", + "# Give the file name you want to save it \n", + "file_name = \"child_vs_adult.csv\" \n", + "\n", + "# Derive the file location\n", + "file_location = \"/content/\" + file_name" + ], + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gd1WH9wQdou7" + }, + "source": [ + "# Downloading, creating of logistic regression model and calculating of metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": { 
+ "id": "uMe4N40Uh5fe" + }, + "source": [ + "Download and read the CSV file" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ouYfE4areWFk" + }, + "source": [ + "# Download the file from drive\n", + "gdown.download(download_url, file_location, quiet=False)\n", + "\n", + "# Read the CSV file\n", + "data = pd.read_csv(file_location)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GHfLu6-aggWJ" + }, + "source": [ + "Display a sample from the dataset" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ipsfzY3BgqAE" + }, + "source": [ + "# Print a sample from the csv dataset\n", + "print('---------- First 5 rows of the Dataset ----------\\n', data.head())" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T7pn823DhbAi" + }, + "source": [ + "Create the logistic regression model" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "I-LutOhihk1-" + }, + "source": [ + "# Please change the target variable according to the dataset\n", + "# You can refer to the dataset details printed in the above step \n", + "target_column = 'who_am_I'\n", + "\n", + "# Separate the training data by removing the target column\n", + "X = data.drop(columns=[target_column])\n", + "\n", + "# Separate the target values\n", + "y = data[target_column].values\n", + "\n", + "# Split the dataset into train and test data\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)\n", + "\n", + "# Create logistic regressor\n", + "logistic_regressor = LogisticRegression()\n", + "\n", + "# Train the model\n", + "logistic_regressor.fit(X_train, y_train)\n", + "\n", + "# Predict using test values\n", + "y_pred = logistic_regressor.predict(X_test)\n", + "\n", + "# Get actual values and predicted values into a table\n", + "predicted_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})\n", + 
"print('---------- Predicted Results ----------\\n', predicted_results)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RxAogMqQhlcW" + }, + "source": [ + "Confusion matrix" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qSMrcrFYhuPX" + }, + "source": [ + "# Calculate the confusion matrix\n", + "confusion_mat = confusion_matrix(y_test, y_pred)\n", + "print ('---------- Confusion Matrix of Your Model -----------\\n', confusion_mat)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eEpwTG1oAtls" + }, + "source": [ + "Accuracy" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EEfpac8-AveP" + }, + "source": [ + "# Calculate accuracy using 'accuracy_score'\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "print('---------- Accuracy of Your Model -----------\\n', accuracy)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/Chapter10/LogisticRegression/logistic_regression.py b/Chapter10/LogisticRegression/logistic_regression.py index 7e0bdce..bc3813f 100644 --- a/Chapter10/LogisticRegression/logistic_regression.py +++ b/Chapter10/LogisticRegression/logistic_regression.py @@ -1,3 +1,100 @@ -# TODO: Create a logistic regression model (using scikit learn) on the child vs adult dataset. -# Calculate the accuracy and confusion matrix -# TODO: Code should be well commented. 
\ No newline at end of file
+'''Copyright (c) 2021 AIClub
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the Software without restriction, including without
+limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
+LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
+EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.'''
+
+
+# Python program to create a logistic regression model (using scikit learn) on the child vs adult dataset.
+# Calculate the accuracy and confusion matrix
+
+
+# Import pandas module to read the CSV file and to process the tabular data
+import pandas as pd
+
+# Import train_test_split to split data as test and train
+from sklearn.model_selection import train_test_split
+
+# Import LogisticRegression class
+from sklearn.linear_model import LogisticRegression
+
+# Import accuracy_score and confusion_matrix functions for accuracy calculations
+from sklearn.metrics import accuracy_score, confusion_matrix
+
+# Import gdown module to download files from google drive
+import gdown
+
+
+# ------------------------------------ Get the file from the google drive. ------------------------------------------
+
+# Please change the url as needed (make sure you have the access to the file)
+url = 'https://drive.google.com/file/d/1J5z8OsAtgSp9i1eLxQFoxVexSZJuhI_-/view?usp=sharing'
+
+# Derive the file id from the url
+file_id = url.split('/')[-2]
+
+# Derive the download url of the file
+download_url = 'https://drive.google.com/uc?id=' + file_id
+
+# Give the location you want to save it in your local machine
+file_location = r'child vs adult.csv'
+
+# Download the file from drive to your local machine
+gdown.download(download_url, file_location)
+
+
+# ------------------------------------ Create the Logistic regression model -----------------------------------------
+
+# Read the CSV file
+data = pd.read_csv(file_location)
+
+# Print a sample from the csv dataset
+print('\n---------- First 5 rows of the Dataset ----------\n', data.head())
+
+# Please change the target variable according to the dataset
+# You can refer to the dataset details printed in the above step
+target_column = 'who_am_I'
+
+# Separate the feature columns by removing the target column
+X = data.drop(columns=[target_column])
+
+# Separate the target values
+y = data[target_column].values
+
+# Split into train and test data; fixed seed and class-stratified (same arguments as the notebook) so runs are reproducible
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
+
+# Create logistic regressor
+logistic_regressor = LogisticRegression()
+
+# Train the model
+logistic_regressor.fit(X_train, y_train)
+
+# Predict using test values
+y_pred = logistic_regressor.predict(X_test)
+
+# Get actual values and predicted values into a table
+predicted_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
+print('\n---------- Predicted Results ----------\n', predicted_results)
+
+
+# ------------------------------- Calculate the Accuracy and Confusion Matrix --------------------------------------
+
+# Calculate the confusion matrix
+confusion_mat = confusion_matrix(y_test, y_pred)
+print('\n---------- Confusion Matrix of Your Model -----------\n', confusion_mat)
+
+# Calculate accuracy using 'accuracy_score'
+accuracy = accuracy_score(y_test, y_pred)
+print('\n---------- Accuracy of Your Model -----------\n', accuracy)
\ No newline at end of file

From d8b1f5aeba3a233aaea52225f28037ed5de909d5 Mon Sep 17 00:00:00 2001
From: ChathukiKet
Date: Fri, 26 Nov 2021 23:06:00 +0530
Subject: [PATCH 2/2] remove the print statement

---
 Chapter10/LogisticRegression/logistic_regression.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/Chapter10/LogisticRegression/logistic_regression.py b/Chapter10/LogisticRegression/logistic_regression.py
index bc3813f..9b1281f 100644
--- a/Chapter10/LogisticRegression/logistic_regression.py
+++ b/Chapter10/LogisticRegression/logistic_regression.py
@@ -84,10 +84,6 @@
 # Predict using test values
 y_pred = logistic_regressor.predict(X_test)
 
-# Get actual values and predicted values into a table
-predicted_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
-print('\n---------- Predicted Results ----------\n', predicted_results)
-
 
 # ------------------------------- Calculate the Accuracy and Confusion Matrix --------------------------------------
 