# %% [markdown]
# # SET2 - Linear Regression on the Air Quality dataset
#
# We fit a scikit-learn `LinearRegression` model to the Air Quality dataset and
# then check how far its predictions are from held-out values.
#
# * **Target:** `PT08.S5(O3)` (the last column left after cleaning).
# * **Features:** the nine other columns kept by the cleaning step.
# * **Metric:** mean absolute error expressed as a fraction of the true value.

# %%
# Let's import all required libraries

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt

# %%
# Configuration: everything a reader might want to tune lives here.

DATA_URL = 'https://github.com/rising-entropy/datasets/raw/main/AirQualityUCI.csv'

# Columns removed before modelling. 'Unnamed: 15' and 'Unnamed: 16' showed 9471
# nulls each in the recorded run (presumably artefacts of trailing separators in
# the CSV - confirm); the others are simply not used as features here.
DROPPED_COLUMNS = ['Date', 'Time', 'Unnamed: 15', 'Unnamed: 16', 'RH', 'AH', 'T']

# Column the model predicts; it is the last column once DROPPED_COLUMNS are gone.
TARGET_COLUMN = 'PT08.S5(O3)'

TEST_SIZE = 0.2      # fraction of rows held out for testing
RANDOM_STATE = 2     # fixed seed so the train/test split is reproducible

# NOTE(review): the UCI description of this dataset says missing readings are
# tagged with -200. Confirm whether this copy of the CSV keeps that tag; if it
# does, set MISSING_SENTINEL = -200 so those readings are imputed instead of
# being used as real measurements. None leaves the data untouched.
MISSING_SENTINEL = None

# %% [markdown]
# ## Load data

# %%
# Import dataset

air_raw = pd.read_csv(DATA_URL)
air_raw.head()

# %%
# Looking for missing values in the table.
air_raw.isnull().sum()

# %% [markdown]
# ## Clean

# %%
# Handling the missing values
#
# The cleaned frame is derived from `air_raw` without modifying it, so this cell
# can be re-run safely (dropping columns from an already-cleaned frame would
# raise KeyError).

air_selected = air_raw.drop(columns=DROPPED_COLUMNS)

if MISSING_SENTINEL is not None:
    air_selected = air_selected.replace(MISSING_SENTINEL, np.nan)

# Rows in which every remaining column is missing carry no information; filling
# them with column means would fabricate samples (target included), so they are
# removed rather than imputed.
air_data = air_selected.dropna(how='all')

# Replacing the remaining missing values with the column mean. A single
# frame-level fillna is used instead of `frame[col].fillna(..., inplace=True)`,
# which acts on a column selection and is not guaranteed to update the frame.
air_data = air_data.fillna(air_data.mean())

assert not air_data.isnull().values.any(), "unexpected missing values after cleaning"
air_data.isnull().sum()

# %%
# Get dataset head

air_data.head()

# %% [markdown]
# ## Train / test split

# %%
# Splitting the dataset
assert air_data.columns[-1] == TARGET_COLUMN, "target is expected to be the last column"

X = air_data.drop(columns=TARGET_COLUMN).values
Y = air_data[[TARGET_COLUMN]].values   # kept 2-D: one row per sample, one column

# We leave TEST_SIZE of the rows for testing
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
assert len(X_train) + len(X_test) == len(X)

# %% [markdown]
# ## Model

# %%
# Perform Linear Regression training data model
model = LinearRegression()
model.fit(X_train, Y_train)

# %% [markdown]
# ## Evaluation

# %%
# We now test the model
# The score is the average of |true - predicted| / |true| over the test rows.
# scikit-learn's implementation divides by the absolute true value (with a tiny
# floor), so a zero or negative target cannot crash the cell or cancel out other
# rows' errors, which the hand-written signed division could.

Y_pred = model.predict(X_test)
theValue = mean_absolute_percentage_error(Y_test, Y_pred)

print("The Error is", theValue, "fractions average.")

# %% [markdown]
# Thus, we performed pre-processing, training and testing of a Linear Regression
# model on the Air Quality dataset.