diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 59b955a..a5ef640 100755 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -12,15 +13,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# import numpy and pandas\n", - "\n" + "\n", + "import numpy as np\n", + "import pandas as pd" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -31,14 +35,68 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "data = pd.read_csv(\"Current_Employee_Names__Salaries__and_Position_Titles.csv\")\n", + "data.info()  # info is a method: it must be called to print the column/dtype summary" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -47,15 +105,136 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameJob TitlesDepartmentFull or Part-TimeSalary or HourlyTypical HoursAnnual SalaryHourly Rate
0AARON, JEFFERY MSERGEANTPOLICEFSalaryNaN101442.0NaN
1AARON, KARINAPOLICE OFFICER (ASSIGNED AS DETECTIVE)POLICEFSalaryNaN94122.0NaN
2AARON, KIMBERLEI RCHIEF CONTRACT EXPEDITERGENERAL SERVICESFSalaryNaN101592.0NaN
3ABAD JR, VICENTE MCIVIL ENGINEER IVWATER MGMNTFSalaryNaN110064.0NaN
4ABASCAL, REECE ETRAFFIC CONTROL AIDE-HOURLYOEMCPHourly20.0NaN19.86
\n", + "
" + ], + "text/plain": [ + " Name Job Titles \n", + "0 AARON, JEFFERY M SERGEANT \\\n", + "1 AARON, KARINA POLICE OFFICER (ASSIGNED AS DETECTIVE) \n", + "2 AARON, KIMBERLEI R CHIEF CONTRACT EXPEDITER \n", + "3 ABAD JR, VICENTE M CIVIL ENGINEER IV \n", + "4 ABASCAL, REECE E TRAFFIC CONTROL AIDE-HOURLY \n", + "\n", + " Department Full or Part-Time Salary or Hourly Typical Hours \n", + "0 POLICE F Salary NaN \\\n", + "1 POLICE F Salary NaN \n", + "2 GENERAL SERVICES F Salary NaN \n", + "3 WATER MGMNT F Salary NaN \n", + "4 OEMC P Hourly 20.0 \n", + "\n", + " Annual Salary Hourly Rate \n", + "0 101442.0 NaN \n", + "1 94122.0 NaN \n", + "2 101592.0 NaN \n", + "3 110064.0 NaN \n", + "4 NaN 19.86 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "data.head()" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -64,15 +243,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Name 0\n", + "Job Titles 0\n", + "Department 0\n", + "Full or Part-Time 0\n", + "Salary or Hourly 0\n", + "Typical Hours 25161\n", + "Annual Salary 8022\n", + "Hourly Rate 25161\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "data.isna().sum()" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -81,15 +281,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Salary or Hourly\n", + "Salary 25161\n", + "Hourly 8022\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + 
"data[\"Salary or Hourly\"].value_counts()" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -97,6 +313,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -105,15 +322,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Department\n", + "POLICE 13414\n", + "FIRE 4641\n", + "STREETS & SAN 2198\n", + "OEMC 2102\n", + "WATER MGMNT 1879\n", + "AVIATION 1629\n", + "TRANSPORTN 1140\n", + "PUBLIC LIBRARY 1015\n", + "GENERAL SERVICES 980\n", + "FAMILY & SUPPORT 615\n", + "FINANCE 560\n", + "HEALTH 488\n", + "CITY COUNCIL 411\n", + "LAW 407\n", + "BUILDINGS 269\n", + "COMMUNITY DEVELOPMENT 207\n", + "BUSINESS AFFAIRS 171\n", + "COPA 116\n", + "BOARD OF ELECTION 107\n", + "DoIT 99\n", + "PROCUREMENT 92\n", + "INSPECTOR GEN 87\n", + "MAYOR'S OFFICE 85\n", + "CITY CLERK 84\n", + "ANIMAL CONTRL 81\n", + "HUMAN RESOURCES 79\n", + "CULTURAL AFFAIRS 65\n", + "BUDGET & MGMT 46\n", + "ADMIN HEARNG 39\n", + "DISABILITIES 28\n", + "TREASURER 22\n", + "HUMAN RELATIONS 16\n", + "BOARD OF ETHICS 8\n", + "POLICE BOARD 2\n", + "LICENSE APPL COMM 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "data[\"Department\"].value_counts()" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -124,15 +390,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Population: 8022\n", + "Hypothesis: 30\n", + "Confidence interval tail: two-sided\n", + "Significance level: 0.05\n", + "Sample Population: 8022\n", + "\n", + "Sample Mean: 32.78855771628024\n", + "Sample STD: 12.112572684276799\n", + "Statistic: 
20.6198057854942\n", + "P-value: 4.3230240486229894e-92\n", + "df: 8021\n", + "\n", + "We can reject the null hypothesis\n" + ] + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "import scipy.stats as st\n", + "\n", + "# Hypothesis\n", + "mu = 30 # H0: mu = 30\n", + "cit = \"two-sided\" # H1: mu != 30\n", + "\n", + "# Significance level\n", + "alpha = 0.05\n", + "\n", + "# Sample\n", + "hourly_wages = data[data[\"Salary or Hourly\"] == \"Hourly\"][\"Hourly Rate\"].dropna()  # a NaN rate would turn the t-test result into NaN\n", + "n = len(hourly_wages)  # use every hourly worker; derived from the data instead of a hard-coded 8022\n", + "sample = hourly_wages.sample(n, random_state=42)  # seeded so the draw stays reproducible if n is lowered\n", + "\n", + "# T-test\n", + "t_test_result = st.ttest_1samp(sample, mu, alternative=cit)\n", + "\n", + "# Decision\n", + "if t_test_result.pvalue < alpha:\n", + " decision = \"We can reject the null hypothesis\"\n", + "else:\n", + " decision = \"We cannot reject the null hypothesis\"\n", + "\n", + "# Results\n", + "print(\"Population:\", hourly_wages.count())\n", + "print(\"Hypothesis:\", mu)\n", + "print(\"Confidence interval tail:\", cit)\n", + "print(\"Significance level:\", alpha)\n", + "print(\"Sample Population:\", n)\n", + "print()\n", + "print(\"Sample Mean:\", sample.mean())\n", + "print(\"Sample STD:\", sample.std(ddof=1))\n", + "print(\"Statistic:\", t_test_result.statistic)\n", + "print(\"P-value:\", t_test_result.pvalue)\n", + "print(\"df:\", t_test_result.df)\n", + "print()\n", + "print(decision)\n" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -143,15 +467,74 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Population: 13404\n", + "Hypothesis (last year's mean): 86000\n", + "Confidence interval tail: greater\n", + "Significance level: 0.05\n", + "Sample Population: 13404\n", + "\n", + "Sample Mean: 86486.41450313339\n", + "Sample STD: 18272.22829399308\n", + "T-statistic: 3.0819970057129935\n", + "P-value: 0.0005150850887741289\n", + "Degrees 
of freedom: 13403\n", + "\n", + "We can reject the null hypothesis\n" + ] + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "# Population: all salaried employees in the police force\n", + "population = data[(data[\"Department\"] == \"POLICE\") & (data[\"Salary or Hourly\"] == \"Salary\")]\n", + "p = population[\"Annual Salary\"].count()\n", + "\n", + "# Hypothesis\n", + "mu_last_year = 86000 # Last year's mean salary\n", + "alternative = \"greater\" # We are testing if this year's data are greater\n", + "\n", + "# Significance level\n", + "alpha = 0.05\n", + "\n", + "# Sample\n", + "n = int(p)  # test on the whole population; derived from the data instead of a hard-coded 13404\n", + "sample = population[\"Annual Salary\"].dropna().sample(n, random_state=42)  # drop NaN so exactly the p counted salaries are used\n", + "\n", + "# T-test\n", + "t_statistic, p_value = st.ttest_1samp(sample, mu_last_year, alternative=alternative)\n", + "\n", + "# Decision (ttest_1samp already returns the one-sided p-value when an alternative is given)\n", + "if p_value < alpha:\n", + " decision = \"We can reject the null hypothesis\"\n", + "else:\n", + " decision = \"We cannot reject the null hypothesis\"\n", + "\n", + "# Results\n", + "print(\"Population:\", p)\n", + "print(\"Hypothesis (last year's mean):\", mu_last_year)\n", + "print(\"Confidence interval tail:\", alternative)\n", + "print(\"Significance level:\", alpha)\n", + "print(\"Sample Population:\", n)\n", + "print()\n", + "print(\"Sample Mean:\", sample.mean())\n", + "print(\"Sample STD:\", sample.std(ddof=1))\n", + "print(\"T-statistic:\", t_statistic)\n", + "print(\"P-value:\", p_value) # Already one-sided; halving it again would understate the p-value\n", + "print(\"Degrees of freedom:\", n - 1)\n", + "print()\n", + "print(decision)" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -160,15 +543,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Department with the most hourly workers: STREETS & SAN\n" + ] + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "# 
Cross-tabulation of Department and Salary or Hourly columns\n", + "cross_tab = pd.crosstab(data[\"Department\"], data[\"Salary or Hourly\"])\n", + "\n", + "# Department with the most hourly workers\n", + "department_with_most_hourly = cross_tab[\"Hourly\"].idxmax()  # index label (department) of the largest Hourly count\n", + "\n", + "print(\"Department with the most hourly workers:\", department_with_most_hourly)" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -177,15 +576,74 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Department with the most hourly workers: STREETS & SAN\n", + "Hypothesis (hourly wage threshold): 35\n", + "Confidence interval tail: less\n", + "Significance level: 0.05\n", + "Sample Population: 1862\n", + "\n", + "Sample Mean: 33.72837808807734\n", + "Sample STD: 5.735241841459235\n", + "T-statistic: -9.567447887848152\n", + "P-value: 1.6689265282353859e-21\n", + "Degrees of freedom: 1861\n", + "\n", + "We can reject the null hypothesis\n" + ] + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "# Select the data for the department with the most hourly workers\n", + "department_with_most_hourly = cross_tab[\"Hourly\"].idxmax()  # same lookup as the previous cell, repeated here\n", + "department_data = data[(data[\"Department\"] == department_with_most_hourly) & (data[\"Salary or Hourly\"] == \"Hourly\")]\n", + "\n", + "# Hypothesis\n", + "mu = 35 # Hourly wage threshold\n", + "alternative = \"less\" # We are testing if the hourly wage is less than the threshold\n", + "\n", + "# Significance level\n", + "alpha = 0.05\n", + "\n", + "# Sample\n", + "n = department_data.shape[0]  # number of rows selected above\n", + "sample = department_data[\"Hourly Rate\"]\n", + "\n", + "# T-test\n", + "t_statistic, p_value = st.ttest_1samp(sample, mu, alternative=alternative)\n", + "\n", + "# Decision\n", + "if p_value < alpha and t_statistic < 0:  # significant and in the tested (lower) direction\n", + " decision = \"We can reject the null hypothesis\"\n", 
+ "else:\n", + " decision = \"We cannot reject the null hypothesis\"\n", + "\n", + "# Results\n", + "print(\"Department with the most hourly workers:\", department_with_most_hourly)\n", + "print(\"Hypothesis (hourly wage threshold):\", mu)\n", + "print(\"Confidence interval tail:\", alternative)\n", + "print(\"Significance level:\", alpha)\n", + "print(\"Sample Population:\", n)\n", + "print()\n", + "print(\"Sample Mean:\", sample.mean())\n", + "print(\"Sample STD:\", sample.std(ddof=1))\n", + "print(\"T-statistic:\", t_statistic)\n", + "print(\"P-value:\", p_value)\n", + "print(\"Degrees of freedom:\", n - 1)\n", + "print()\n", + "print(decision)" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -206,15 +664,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95% Confidence Interval for Mean Hourly Wage:\n", + "Lower bound: 32.52345834488425\n", + "Upper bound: 33.05365708767623\n" + ] + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "# Data for hourly workers\n", + "hourly_data = data[data[\"Salary or Hourly\"] == \"Hourly\"]\n", + "sample = hourly_data[\"Hourly Rate\"]\n", + "\n", + "# Calculate the inputs of the t-based interval\n", + "confidence_level = 0.95\n", + "degrees_of_freedom = sample.shape[0] - 1\n", + "mean = sample.mean()\n", + "standard_error = st.sem(sample)  # standard error of the mean\n", + "\n", + "# Compute the interval; the positional arguments are confidence, df, loc, scale\n", + "confidence_interval = st.t.interval(confidence_level, degrees_of_freedom, mean, standard_error)\n", + "\n", + "# Results\n", + "print(\"95% Confidence Interval for Mean Hourly Wage:\")\n", + "print(\"Lower bound:\", confidence_interval[0])\n", + "print(\"Upper bound:\", confidence_interval[1])" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -223,15 +709,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, - "outputs": [], + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95% Confidence Interval for Mean Salary of Salaried Employees in the Police Department:\n", + "Lower bound: 86177.05631531784\n", + "Upper bound: 86795.77269094894\n" + ] + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "# Select the data for salaried employees in the police department\n", + "police_salary_data = data[(data[\"Department\"] == \"POLICE\") & (data[\"Salary or Hourly\"] == \"Salary\")]\n", + "sample = police_salary_data[\"Annual Salary\"]\n", + "\n", + "# Calculate the necessary parameters\n", + "confidence_level = 0.95\n", + "degrees_of_freedom = sample.shape[0] - 1\n", + "mean = sample.mean()\n", + "standard_error = st.sem(sample)  # standard error of the mean\n", + "\n", + "# Compute the confidence interval; the positional arguments are confidence, df, loc, scale\n", + "confidence_interval = st.t.interval(confidence_level, degrees_of_freedom, mean, standard_error)\n", + "\n", + "print(\"95% Confidence Interval for Mean Salary of Salaried Employees in the Police Department:\")\n", + "print(\"Lower bound:\", confidence_interval[0])\n", + "print(\"Upper bound:\", confidence_interval[1])" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -243,16 +756,6 @@ "\n", "In the cell below, use the `proportions_ztest` function from `statsmodels` to perform a hypothesis test that will determine whether the number of hourly workers in the City of Chicago is significantly different from 25% at the 95% confidence level." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n", - "\n" - ] } ], "metadata": { @@ -271,7 +774,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.11.3" } }, "nbformat": 4,