From c48ec9657d407dff6703d632b0586dea2ee98136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdanielmdepaoli=E2=80=9D?= <“danielmdepaoli@gmail.com”> Date: Mon, 14 Aug 2023 09:00:29 +0100 Subject: [PATCH] Lab Done --- your-code/main.ipynb | 946 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 901 insertions(+), 45 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 59b955a..333dd37 100755 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -12,12 +12,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "# import numpy and pandas\n", - "\n" + "import pandas as pd \n", + "import numpy as np\n", + "import scipy.stats as st " ] }, { @@ -31,11 +32,218 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameJob TitlesDepartmentFull or Part-TimeSalary or HourlyTypical HoursAnnual SalaryHourly Rate
0AARON, JEFFERY MSERGEANTPOLICEFSalaryNaN101442.0NaN
1AARON, KARINAPOLICE OFFICER (ASSIGNED AS DETECTIVE)POLICEFSalaryNaN94122.0NaN
2AARON, KIMBERLEI RCHIEF CONTRACT EXPEDITERGENERAL SERVICESFSalaryNaN101592.0NaN
3ABAD JR, VICENTE MCIVIL ENGINEER IVWATER MGMNTFSalaryNaN110064.0NaN
4ABASCAL, REECE ETRAFFIC CONTROL AIDE-HOURLYOEMCPHourly20.0NaN19.86
...........................
33178ZYLINSKA, KATARZYNAPOLICE OFFICERPOLICEFSalaryNaN72510.0NaN
33179ZYMANTAS, LAURA CPOLICE OFFICERPOLICEFSalaryNaN48078.0NaN
33180ZYMANTAS, MARK EPOLICE OFFICERPOLICEFSalaryNaN90024.0NaN
33181ZYRKOWSKI, CARLO EPOLICE OFFICERPOLICEFSalaryNaN93354.0NaN
33182ZYSKOWSKI, DARIUSZCHIEF DATA BASE ANALYSTDoITFSalaryNaN115932.0NaN
\n", + "

33183 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Name Job Titles \\\n", + "0 AARON, JEFFERY M SERGEANT \n", + "1 AARON, KARINA POLICE OFFICER (ASSIGNED AS DETECTIVE) \n", + "2 AARON, KIMBERLEI R CHIEF CONTRACT EXPEDITER \n", + "3 ABAD JR, VICENTE M CIVIL ENGINEER IV \n", + "4 ABASCAL, REECE E TRAFFIC CONTROL AIDE-HOURLY \n", + "... ... ... \n", + "33178 ZYLINSKA, KATARZYNA POLICE OFFICER \n", + "33179 ZYMANTAS, LAURA C POLICE OFFICER \n", + "33180 ZYMANTAS, MARK E POLICE OFFICER \n", + "33181 ZYRKOWSKI, CARLO E POLICE OFFICER \n", + "33182 ZYSKOWSKI, DARIUSZ CHIEF DATA BASE ANALYST \n", + "\n", + " Department Full or Part-Time Salary or Hourly Typical Hours \\\n", + "0 POLICE F Salary NaN \n", + "1 POLICE F Salary NaN \n", + "2 GENERAL SERVICES F Salary NaN \n", + "3 WATER MGMNT F Salary NaN \n", + "4 OEMC P Hourly 20.0 \n", + "... ... ... ... ... \n", + "33178 POLICE F Salary NaN \n", + "33179 POLICE F Salary NaN \n", + "33180 POLICE F Salary NaN \n", + "33181 POLICE F Salary NaN \n", + "33182 DoIT F Salary NaN \n", + "\n", + " Annual Salary Hourly Rate \n", + "0 101442.0 NaN \n", + "1 94122.0 NaN \n", + "2 101592.0 NaN \n", + "3 110064.0 NaN \n", + "4 NaN 19.86 \n", + "... ... ... \n", + "33178 72510.0 NaN \n", + "33179 48078.0 NaN \n", + "33180 90024.0 NaN \n", + "33181 93354.0 NaN \n", + "33182 115932.0 NaN \n", + "\n", + "[33183 rows x 8 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "employees = pd.read_csv(\"Current_Employee_Names__Salaries__and_Position_Titles.csv\")\n", + "employees" ] }, { @@ -47,12 +255,130 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameJob TitlesDepartmentFull or Part-TimeSalary or HourlyTypical HoursAnnual SalaryHourly Rate
0AARON, JEFFERY MSERGEANTPOLICEFSalaryNaN101442.0NaN
1AARON, KARINAPOLICE OFFICER (ASSIGNED AS DETECTIVE)POLICEFSalaryNaN94122.0NaN
2AARON, KIMBERLEI RCHIEF CONTRACT EXPEDITERGENERAL SERVICESFSalaryNaN101592.0NaN
3ABAD JR, VICENTE MCIVIL ENGINEER IVWATER MGMNTFSalaryNaN110064.0NaN
4ABASCAL, REECE ETRAFFIC CONTROL AIDE-HOURLYOEMCPHourly20.0NaN19.86
\n", + "
" + ], + "text/plain": [ + " Name Job Titles \\\n", + "0 AARON, JEFFERY M SERGEANT \n", + "1 AARON, KARINA POLICE OFFICER (ASSIGNED AS DETECTIVE) \n", + "2 AARON, KIMBERLEI R CHIEF CONTRACT EXPEDITER \n", + "3 ABAD JR, VICENTE M CIVIL ENGINEER IV \n", + "4 ABASCAL, REECE E TRAFFIC CONTROL AIDE-HOURLY \n", + "\n", + " Department Full or Part-Time Salary or Hourly Typical Hours \\\n", + "0 POLICE F Salary NaN \n", + "1 POLICE F Salary NaN \n", + "2 GENERAL SERVICES F Salary NaN \n", + "3 WATER MGMNT F Salary NaN \n", + "4 OEMC P Hourly 20.0 \n", + "\n", + " Annual Salary Hourly Rate \n", + "0 101442.0 NaN \n", + "1 94122.0 NaN \n", + "2 101592.0 NaN \n", + "3 110064.0 NaN \n", + "4 NaN 19.86 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n", - "\n" + "employees.head()" ] }, { @@ -64,12 +390,206 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameJob TitlesDepartmentFull or Part-TimeSalary or HourlyTypical HoursAnnual SalaryHourly Rate
0FalseFalseFalseFalseFalseTrueFalseTrue
1FalseFalseFalseFalseFalseTrueFalseTrue
2FalseFalseFalseFalseFalseTrueFalseTrue
3FalseFalseFalseFalseFalseTrueFalseTrue
4FalseFalseFalseFalseFalseFalseTrueFalse
...........................
33178FalseFalseFalseFalseFalseTrueFalseTrue
33179FalseFalseFalseFalseFalseTrueFalseTrue
33180FalseFalseFalseFalseFalseTrueFalseTrue
33181FalseFalseFalseFalseFalseTrueFalseTrue
33182FalseFalseFalseFalseFalseTrueFalseTrue
\n", + "

33183 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Name Job Titles Department Full or Part-Time Salary or Hourly \\\n", + "0 False False False False False \n", + "1 False False False False False \n", + "2 False False False False False \n", + "3 False False False False False \n", + "4 False False False False False \n", + "... ... ... ... ... ... \n", + "33178 False False False False False \n", + "33179 False False False False False \n", + "33180 False False False False False \n", + "33181 False False False False False \n", + "33182 False False False False False \n", + "\n", + " Typical Hours Annual Salary Hourly Rate \n", + "0 True False True \n", + "1 True False True \n", + "2 True False True \n", + "3 True False True \n", + "4 False True False \n", + "... ... ... ... \n", + "33178 True False True \n", + "33179 True False True \n", + "33180 True False True \n", + "33181 True False True \n", + "33182 True False True \n", + "\n", + "[33183 rows x 8 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n", - "\n" + "nan_values = employees.isna()\n", + "\n", + "nan_values" ] }, { @@ -81,12 +601,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Salary 25161\n", + "Hourly 8022\n", + "Name: Salary or Hourly, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n", - "\n" + "employees[\"Salary or Hourly\"].value_counts()" ] }, { @@ -105,12 +637,57 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "POLICE 13414\n", + "FIRE 4641\n", + "STREETS & SAN 2198\n", + "OEMC 2102\n", + "WATER MGMNT 1879\n", + "AVIATION 1629\n", + "TRANSPORTN 1140\n", + "PUBLIC LIBRARY 1015\n", + "GENERAL SERVICES 980\n", + 
"FAMILY & SUPPORT 615\n", + "FINANCE 560\n", + "HEALTH 488\n", + "CITY COUNCIL 411\n", + "LAW 407\n", + "BUILDINGS 269\n", + "COMMUNITY DEVELOPMENT 207\n", + "BUSINESS AFFAIRS 171\n", + "COPA 116\n", + "BOARD OF ELECTION 107\n", + "DoIT 99\n", + "PROCUREMENT 92\n", + "INSPECTOR GEN 87\n", + "MAYOR'S OFFICE 85\n", + "CITY CLERK 84\n", + "ANIMAL CONTRL 81\n", + "HUMAN RESOURCES 79\n", + "CULTURAL AFFAIRS 65\n", + "BUDGET & MGMT 46\n", + "ADMIN HEARNG 39\n", + "DISABILITIES 28\n", + "TREASURER 22\n", + "HUMAN RELATIONS 16\n", + "BOARD OF ETHICS 8\n", + "POLICE BOARD 2\n", + "LICENSE APPL COMM 1\n", + "Name: Department, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n", - "\n" + "employees[\"Department\"].value_counts()" ] }, { @@ -122,14 +699,105 @@ "In this section of the lab, we will test whether the hourly wage of all hourly workers is significantly different from $30/hr. Import the correct one sample test function from scipy and perform the hypothesis test for a 95% two sided confidence interval." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "H0 = 30\n", + "\n", + "H1 != 30" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "4 19.86\n", + "6 46.10\n", + "7 35.60\n", + "10 2.65\n", + "18 17.68\n", + " ... 
\n", + "33164 46.10\n", + "33168 17.68\n", + "33169 35.60\n", + "33174 46.35\n", + "33175 48.85\n", + "Name: Hourly Rate, Length: 8022, dtype: float64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n", - "\n" + "employees[employees['Salary or Hourly'] == \"Hourly\"][\"Hourly Rate\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4 19.86\n", + "6 46.10\n", + "7 35.60\n", + "10 2.65\n", + "18 17.68\n", + " ... \n", + "33164 46.10\n", + "33168 17.68\n", + "33169 35.60\n", + "33174 46.35\n", + "33175 48.85\n", + "Name: Hourly Rate, Length: 8022, dtype: float64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alpha = 0.05\n", + "\n", + "hourly_sample = employees[employees['Salary or Hourly'] == \"Hourly\"][\"Hourly Rate\"]\n", + "hourly_sample" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=20.6198057854942, pvalue=4.3230240486229894e-92, df=8021)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "st.ttest_1samp(hourly_sample, 30)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "REJECTED" ] }, { @@ -141,14 +809,90 @@ "Hint: A one tailed test has a p-value that is half of the two tailed p-value. If our hypothesis is greater than, then to reject, the test statistic must also be positive." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "H0 mu >= 86000\n", + "\n", + "H1 mu < 86000" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 101442.0\n", + "1 94122.0\n", + "9 93354.0\n", + "11 84054.0\n", + "12 87006.0\n", + " ... 
\n", + "33177 72510.0\n", + "33178 72510.0\n", + "33179 48078.0\n", + "33180 90024.0\n", + "33181 93354.0\n", + "Name: Annual Salary, Length: 13414, dtype: float64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "police_salary = employees[employees['Department'] == \"POLICE\"][\"Annual Salary\"]\n", + "police_salary" + ] + }, + { + "cell_type": "code", + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n", - "\n" + "police_salary1 = police_salary.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "alpha = 0.05" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=3.081997005712994, pvalue=0.9989698298224517, df=13403)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "st.ttest_1samp(police_salary1, 86000, alternative = \"less\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NOT REJECTED" ] }, { @@ -160,12 +904,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n", - "\n" + "cross_tab = pd.crosstab(employees['Department'], employees['Salary or Hourly'])" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "department_with_most_hourly_workers = cross_tab['Hourly'].idxmax()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'STREETS & SAN'" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "department_with_most_hourly_workers" ] }, { @@ -175,14 +947,60 @@ "The workers from the department with the most hourly workers 
have complained that their hourly wage is less than $35/hour. Using a one sample t-test, test this one-sided hypothesis at the 95% confidence level." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "H0: mu (hourly wage) >= 35\n", + "\n", + "H1: mu (hourly wage) < 35" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n", - "\n" + "wage_sample = employees[employees['Department'] =='STREETS & SAN']['Hourly Rate']" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "wage_sample1 = wage_sample.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=-9.567447887848152, pvalue=1.6689265282353859e-21, df=1861)" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alpha = 0.05\n", + "\n", + "st.ttest_1samp(wage_sample1, 35, alternative = \"less\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "REJECTED" ] }, { @@ -206,12 +1024,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confidence interval of the hourly wage: (32.52345834488425, 33.05365708767623)\n" + ] + } + ], "source": [ - "# Your code here:\n", - "\n" + "confidence_level = 0.95\n", + "\n", + "hourly_sample\n", + "\n", + "ddof = len(hourly_sample) -1\n", + "\n", + "mean = hourly_sample.mean()\n", + "\n", + "standard_error = st.sem(hourly_sample)\n", + "\n", + "confidence_interval = st.t.interval(confidence_level, df=ddof, loc=mean, scale=standard_error)\n", + "\n", + "print(f\"Confidence interval of the hourly wage: {confidence_interval}\")" ] }, { @@ -223,12 +1060,31 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 66, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confidence interval of the hourly wage: (86177.05631531784, 86795.77269094894)\n" + ] + } + ], "source": [ - "# Your code here:\n", - "\n" + "confidence_level = 0.95\n", + "\n", + "police_salary1\n", + "\n", + "ddof = len(police_salary1) -1\n", + "\n", + "mean = police_salary1.mean()\n", + "\n", + "standard_error = st.sem(police_salary1)\n", + "\n", + "confidence_interval = st.t.interval(confidence_level, df=ddof, loc=mean, scale=standard_error)\n", + "\n", + "print(f\"Confidence interval of the hourly wage: {confidence_interval}\")" ] }, { @@ -257,7 +1113,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -271,7 +1127,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.9" } }, "nbformat": 4,