diff --git a/SET Assignment2.ipynb b/SET Assignment2.ipynb new file mode 100644 index 0000000..e8028ed --- /dev/null +++ b/SET Assignment2.ipynb @@ -0,0 +1,993 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "SETAssignment2.ipynb", + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nf8JtyyGzdDR" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv(\"Nutrition__Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System.csv\")\n", + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 704 + }, + "id": "9OHwTfqT0WWh", + "outputId": "1b61eac6-4ca1-43e7-a962-8e7444c74bff" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
YearStartYearEndLocationAbbrLocationDescDatasourceClassTopicQuestionData_Value_UnitData_Value_TypeData_ValueData_Value_AltData_Value_Footnote_SymbolData_Value_FootnoteLow_Confidence_LimitHigh_Confidence_LimitSample_SizeTotalAge(years)EducationGenderIncomeRace/EthnicityGeoLocationClassIDTopicIDQuestionIDDataValueTypeIDLocationIDStratificationCategory1Stratification1StratificationCategoryId1StratificationID1
020112011ALAlabamaBehavioral Risk Factor Surveillance SystemObesity / Weight StatusObesity / Weight StatusPercent of adults aged 18 years and older who ...NaNValue32.032.0NaNNaN30.533.57304.0TotalNaNNaNNaNNaNNaN(32.84057112200048, -86.63186076199969)OWSOWS1Q036VALUE1.0TotalTotalOVROVERALL
120112011ALAlabamaBehavioral Risk Factor Surveillance SystemObesity / Weight StatusObesity / Weight StatusPercent of adults aged 18 years and older who ...NaNValue32.332.3NaNNaN29.934.72581.0NaNNaNNaNMaleNaNNaN(32.84057112200048, -86.63186076199969)OWSOWS1Q036VALUE1.0GenderMaleGENMALE
220112011ALAlabamaBehavioral Risk Factor Surveillance SystemObesity / Weight StatusObesity / Weight StatusPercent of adults aged 18 years and older who ...NaNValue31.831.8NaNNaN30.033.64723.0NaNNaNNaNFemaleNaNNaN(32.84057112200048, -86.63186076199969)OWSOWS1Q036VALUE1.0GenderFemaleGENFEMALE
320112011ALAlabamaBehavioral Risk Factor Surveillance SystemObesity / Weight StatusObesity / Weight StatusPercent of adults aged 18 years and older who ...NaNValue33.633.6NaNNaN29.937.61153.0NaNNaNLess than high schoolNaNNaNNaN(32.84057112200048, -86.63186076199969)OWSOWS1Q036VALUE1.0EducationLess than high schoolEDUEDUHS
420112011ALAlabamaBehavioral Risk Factor Surveillance SystemObesity / Weight StatusObesity / Weight StatusPercent of adults aged 18 years and older who ...NaNValue32.832.8NaNNaN30.235.62402.0NaNNaNHigh school graduateNaNNaNNaN(32.84057112200048, -86.63186076199969)OWSOWS1Q036VALUE1.0EducationHigh school graduateEDUEDUHSGRAD
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " YearStart YearEnd ... StratificationCategoryId1 StratificationID1\n", + "0 2011 2011 ... OVR OVERALL\n", + "1 2011 2011 ... GEN MALE\n", + "2 2011 2011 ... GEN FEMALE\n", + "3 2011 2011 ... EDU EDUHS\n", + "4 2011 2011 ... EDU EDUHSGRAD\n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "metadata": {}, + "execution_count": 26 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "T9SENeyK0fk8", + "outputId": "530ddc12-06da-40fa-99e5-e7f00bb0faaf" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 5490 entries, 0 to 5489\n", + "Data columns (total 33 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 YearStart 5490 non-null int64 \n", + " 1 YearEnd 5490 non-null int64 \n", + " 2 LocationAbbr 5490 non-null object \n", + " 3 LocationDesc 5490 non-null object \n", + " 4 Datasource 5490 non-null object \n", + " 5 Class 5490 non-null object \n", + " 6 Topic 5490 non-null object \n", + " 7 Question 5490 non-null object \n", + " 8 Data_Value_Unit 0 non-null float64\n", + " 9 Data_Value_Type 5489 non-null object \n", + " 10 Data_Value 5048 non-null float64\n", + " 11 Data_Value_Alt 5048 non-null float64\n", + " 12 Data_Value_Footnote_Symbol 441 non-null object \n", + " 13 Data_Value_Footnote 441 non-null object \n", + " 14 Low_Confidence_Limit 5048 non-null float64\n", + " 15 High_Confidence_Limit 5048 non-null float64\n", + " 16 Sample_Size 5048 non-null float64\n", + " 17 Total 198 non-null object \n", + " 18 Age(years) 1178 non-null object \n", + " 19 Education 784 non-null object \n", + " 20 Gender 398 non-null object \n", + " 21 Income 1368 non-null object \n", + " 22 Race/Ethnicity 1563 non-null object \n", + " 23 GeoLocation 5480 non-null object \n", + " 24 ClassID 5489 non-null object \n", + " 25 TopicID 
5489 non-null object \n", + " 26 QuestionID 5489 non-null object \n", + " 27 DataValueTypeID 5489 non-null object \n", + " 28 LocationID 5489 non-null float64\n", + " 29 StratificationCategory1 5489 non-null object \n", + " 30 Stratification1 5489 non-null object \n", + " 31 StratificationCategoryId1 5489 non-null object \n", + " 32 StratificationID1 5489 non-null object \n", + "dtypes: float64(7), int64(2), object(24)\n", + "memory usage: 1.4+ MB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "4uVAtAze0i5t", + "outputId": "b5e31d8a-180d-4e61-d853-f181b4330702" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearStartYearEndData_Value_UnitData_ValueData_Value_AltLow_Confidence_LimitHigh_Confidence_LimitSample_SizeLocationID
count5490.0000005490.0000000.05048.0000005048.0000005048.000005048.0000005048.0000005489.000000
mean2012.2300552012.230055NaN31.08686631.08686626.4916436.3111132009.5342715.814903
std1.0945831.094583NaN10.55968210.55968210.2547211.5821359466.8161153.773033
min2011.0000002011.000000NaN0.9000000.9000000.300003.00000050.0000001.000000
25%2011.0000002011.000000NaN23.70000023.70000019.4000028.200000494.0000004.000000
50%2012.0000002012.000000NaN30.40000030.40000025.7000036.000000994.0000006.000000
75%2013.0000002013.000000NaN37.20000037.20000032.8250042.8000001995.0000009.000000
max2015.0000002015.000000NaN72.30000072.30000067.9000083.200000398316.00000059.000000
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n",
+ " "
+ ],
+ "text/plain": [
+ " YearStart YearEnd ... Sample_Size LocationID\n",
+ "count 5490.000000 5490.000000 ... 5048.000000 5489.000000\n",
+ "mean 2012.230055 2012.230055 ... 2009.534271 5.814903\n",
+ "std 1.094583 1.094583 ... 9466.816115 3.773033\n",
+ "min 2011.000000 2011.000000 ... 50.000000 1.000000\n",
+ "25% 2011.000000 2011.000000 ... 494.000000 4.000000\n",
+ "50% 2012.000000 2012.000000 ... 994.000000 6.000000\n",
+ "75% 2013.000000 2013.000000 ... 1995.000000 9.000000\n",
+ "max 2015.000000 2015.000000 ... 398316.000000 59.000000\n",
+ "\n",
+ "[8 rows x 9 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 28
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Restrict to the 'OVERALL' stratification and the four columns used below.\n",
+ "wrangled = df[df['StratificationID1'] == 'OVERALL'][['LocationDesc','Data_Value', 'Question', \"YearStart\" ]]\n",
+ "question = wrangled[wrangled['Question'] == 'Percent of adults who engage in no leisure-time physical activity'][['LocationDesc','Data_Value', 'Question', \"YearStart\" ]]\n",
+ "# Predictor: the 2014 inactivity percentage, kept together with its location.\n",
+ "x_all = question[question['YearStart'] == 2014][['LocationDesc','Data_Value' ]]\n",
+ "x = x_all[['Data_Value' ]].values\n",
+ "x"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "nfvngUQ_0j3A",
+ "outputId": "1dd16745-028c-4eaa-b7be-f9250b989bde"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([[27.6],\n",
+ " [19.2],\n",
+ " [21.2],\n",
+ " [30.7],\n",
+ " [21.7],\n",
+ " [16.4],\n",
+ " [20.6],\n",
+ " [24.9]])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 29
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Target: the 2014 obesity percentage, taken from the same 'OVERALL' subset.\n",
+ "year = wrangled[wrangled['Question'] == 'Percent of adults aged 18 years and older who have obesity'][['LocationDesc','Data_Value', 'Question', \"YearStart\" ]]\n",
+ "y_all = year[year['YearStart'] == 2014][['LocationDesc','Data_Value' ]]\n",
+ "# Look y up by location, in x's row order, so every (x, y) pair belongs to the\n",
+ "# same location. A location missing from y_all raises KeyError instead of\n",
+ "# silently shifting the pairing.\n",
+ "y = y_all.set_index('LocationDesc').loc[x_all['LocationDesc'].tolist(), ['Data_Value' ]].values\n",
+ "assert len(x) == len(y), 'expected exactly one obesity value per location'\n",
+ "y"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ZSaSh-4A0mhY",
+ "outputId": "e691505b-f468-48a5-90fc-fcb08b7ac38e"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([[33.5],\n",
+ " [29.7],\n",
+ " [28.9],\n",
+ " [35.9],\n",
+ " [24.7],\n",
+ " [21.3],\n",
+ " [26.3],\n",
+ " [30.7]])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 30
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "x = x.reshape(-1, 1)\n",
+ "x"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3X4tOzIZ0qfB",
+ "outputId": "f609deec-1dcc-4fb1-9f1f-3fa641da9693"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([[27.6],\n",
+ " [19.2],\n",
+ " [21.2],\n",
+ " [30.7],\n",
+ " [21.7],\n",
+ " [16.4],\n",
+ " [20.6],\n",
+ " [24.9]])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 31
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "y = y.reshape(-1, 1)\n",
+ "y"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "odnC_Qad0s4f",
+ "outputId": "2a690e65-9ec0-4363-c68f-08c415934a77"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([[33.5],\n",
+ " [29.7],\n",
+ " [28.9],\n",
+ " [35.9],\n",
+ " [24.7],\n",
+ " [21.3],\n",
+ " [26.3],\n",
+ " [30.7]])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 32
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# 80/20 split; the fixed random_state keeps the split reproducible across runs.\n",
+ "x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, test_size=.2, random_state=100)"
+ ],
+ "metadata": {
+ "id": "FcsS4k1m0vSK"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(f'X Train Data
shape{x_train.shape}')\n",
+ "print(f'y Train Data shape{y_train.shape}')\n",
+ "print(f'X Test Data shape{x_test.shape}')\n",
+ "print(f'y Test Data shape{y_test.shape}')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "-6qRaxN00xx5",
+ "outputId": "f5e9fe50-2599-41d0-8323-5ee03c40af41"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "X Train Data shape(6, 1)\n",
+ "y Train Data shape(6, 1)\n",
+ "X Test Data shape(2, 1)\n",
+ "y Test Data shape(2, 1)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Raw training points, before any model is fitted.\n",
+ "plt.scatter(x_train, y_train, color='orange')\n",
+ "plt.xlabel('% Adults with reporting no leisure Physical Activity')\n",
+ "plt.ylabel('% of Adults who have Obesity')\n",
+ "plt.title('Physical Data')\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 295
+ },
+ "id": "PQfW9bb500zu",
+ "outputId": "e370fad2-0b49-486f-e2e1-de4d9876efa9"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Ordinary least-squares fit of obesity % on inactivity % (training rows only).\n",
+ "lm = LinearRegression()\n",
+ "lm.fit(x_train, y_train)\n",
+ "y_predict = lm.predict(x_test)"
+ ],
+ "metadata": {
+ "id": "TFQqD_Sf029n"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# LinearRegression.score returns R^2 (coefficient of determination), not a\n",
+ "# classification accuracy. It can be negative, and with only 2 held-out rows\n",
+ "# the test value is extremely unstable.\n",
+ "print(f'Train R^2 score {round(lm.score(x_train, y_train)* 100,2)}%')\n",
+ "print(f'Test R^2 score {round(lm.score(x_test, y_test)* 100,2)}%')"
+ ],
+ "metadata": {
+ "id": "Eh3xosm4056N",
+ "outputId": "9c685099-321c-4a29-9767-41450044cacd",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Train R^2 score 96.51%\n",
+ "Test R^2 score -134.64%\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Same training scatter as before, now with the fitted regression line on top\n",
+ "# so the quality of the fit is visible.\n",
+ "plt.scatter(x_train, y_train, color='orange')\n",
+ "plt.plot(x_train, lm.predict(x_train), color='blue')\n",
+ "plt.xlabel('% Adults with reporting no leisure Physical Activity')\n",
+ "plt.ylabel('% of Adults who have Obesity')\n",
+ "plt.title('Physical Data')\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "id": "jBbc59fY08WU",
+ "outputId": "eebc0794-8e18-47ea-c031-c60739d1bead",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 295
+ }
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "1B5LoqKA0-d9" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/Student assignment updates.txt b/Student assignment updates.txt index 9979d7c..cbc9235 100644 --- a/Student assignment updates.txt +++ b/Student assignment updates.txt @@ -1,2 +1,4 @@ Write your name and PRN no +Pratibha Maind +2019BTECS00088 Hello Updated