From c9b1fc3de60662b8e9b35b05e1281f0be4f64d3c Mon Sep 17 00:00:00 2001 From: AnaCarvalho84 <131803922+AnaCarvalho84@users.noreply.github.com> Date: Tue, 15 Aug 2023 09:17:12 +0100 Subject: [PATCH 1/3] lab done --- main.ipynb | 821 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 821 insertions(+) create mode 100644 main.ipynb diff --git a/main.ipynb b/main.ipynb new file mode 100644 index 0000000..a9460c2 --- /dev/null +++ b/main.ipynb @@ -0,0 +1,821 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Before your start:\n", + "- Read the README.md file\n", + "- Comment as much as you can and use the resources (README.md file)\n", + "- Happy learning!" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "# import numpy and pandas\n", + "import pandas as pd\n", + "import numpy as np\n", + "from scipy.stats import ttest_1samp\n", + "import scipy.stats as st\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 1 - Exploring the Data\n", + "\n", + "In this challenge, we will examine all salaries of employees of the City of Chicago. We will start by loading the dataset and examining its contents." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "chicago = pd.read_csv(\"Current_Employee_Names__Salaries__and_Position_Titles.csv\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Examine the `salaries` dataset using the `head` function below." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameJob TitlesDepartmentFull or Part-TimeSalary or HourlyTypical HoursAnnual SalaryHourly Rate
0AARON, JEFFERY MSERGEANTPOLICEFSalaryNaN101442.0NaN
1AARON, KARINAPOLICE OFFICER (ASSIGNED AS DETECTIVE)POLICEFSalaryNaN94122.0NaN
2AARON, KIMBERLEI RCHIEF CONTRACT EXPEDITERGENERAL SERVICESFSalaryNaN101592.0NaN
3ABAD JR, VICENTE MCIVIL ENGINEER IVWATER MGMNTFSalaryNaN110064.0NaN
4ABASCAL, REECE ETRAFFIC CONTROL AIDE-HOURLYOEMCPHourly20.0NaN19.86
\n", + "
" + ], + "text/plain": [ + " Name Job Titles \\\n", + "0 AARON, JEFFERY M SERGEANT \n", + "1 AARON, KARINA POLICE OFFICER (ASSIGNED AS DETECTIVE) \n", + "2 AARON, KIMBERLEI R CHIEF CONTRACT EXPEDITER \n", + "3 ABAD JR, VICENTE M CIVIL ENGINEER IV \n", + "4 ABASCAL, REECE E TRAFFIC CONTROL AIDE-HOURLY \n", + "\n", + " Department Full or Part-Time Salary or Hourly Typical Hours \\\n", + "0 POLICE F Salary NaN \n", + "1 POLICE F Salary NaN \n", + "2 GENERAL SERVICES F Salary NaN \n", + "3 WATER MGMNT F Salary NaN \n", + "4 OEMC P Hourly 20.0 \n", + "\n", + " Annual Salary Hourly Rate \n", + "0 101442.0 NaN \n", + "1 94122.0 NaN \n", + "2 101592.0 NaN \n", + "3 110064.0 NaN \n", + "4 NaN 19.86 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chicago.head()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see from looking at the `head` function that there is quite a bit of missing data. Let's examine how much missing data is in each column. Produce this output in the cell below" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Name 0\n", + "Job Titles 0\n", + "Department 0\n", + "Full or Part-Time 0\n", + "Salary or Hourly 0\n", + "Typical Hours 25161\n", + "Annual Salary 8022\n", + "Hourly Rate 25161\n", + "dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chicago_null = chicago.isnull().sum()\n", + "chicago_null\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's also look at the count of hourly vs. salaried employees. 
Write the code in the cell below" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Salary 25161\n", + "Hourly 8022\n", + "Name: Salary or Hourly, dtype: int64" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chicago['Salary or Hourly'].value_counts()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What this information indicates is that the table contains information about two types of employees - salaried and hourly. Some columns apply only to one type of employee while other columns only apply to another kind. This is why there are so many missing values. Therefore, we will not do anything to handle the missing values." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are different departments in the city. List all departments and the count of employees in each department." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Name
Department
ADMIN HEARNG39
ANIMAL CONTRL81
AVIATION1629
BOARD OF ELECTION107
BOARD OF ETHICS8
BUDGET & MGMT46
BUILDINGS269
BUSINESS AFFAIRS171
CITY CLERK84
CITY COUNCIL411
COMMUNITY DEVELOPMENT207
COPA116
CULTURAL AFFAIRS65
DISABILITIES28
DoIT99
FAMILY & SUPPORT615
FINANCE560
FIRE4641
GENERAL SERVICES980
HEALTH488
HUMAN RELATIONS16
HUMAN RESOURCES79
INSPECTOR GEN87
LAW407
LICENSE APPL COMM1
MAYOR'S OFFICE85
OEMC2102
POLICE13414
POLICE BOARD2
PROCUREMENT92
PUBLIC LIBRARY1015
STREETS & SAN2198
TRANSPORTN1140
TREASURER22
WATER MGMNT1879
\n", + "
" + ], + "text/plain": [ + " Name\n", + "Department \n", + "ADMIN HEARNG 39\n", + "ANIMAL CONTRL 81\n", + "AVIATION 1629\n", + "BOARD OF ELECTION 107\n", + "BOARD OF ETHICS 8\n", + "BUDGET & MGMT 46\n", + "BUILDINGS 269\n", + "BUSINESS AFFAIRS 171\n", + "CITY CLERK 84\n", + "CITY COUNCIL 411\n", + "COMMUNITY DEVELOPMENT 207\n", + "COPA 116\n", + "CULTURAL AFFAIRS 65\n", + "DISABILITIES 28\n", + "DoIT 99\n", + "FAMILY & SUPPORT 615\n", + "FINANCE 560\n", + "FIRE 4641\n", + "GENERAL SERVICES 980\n", + "HEALTH 488\n", + "HUMAN RELATIONS 16\n", + "HUMAN RESOURCES 79\n", + "INSPECTOR GEN 87\n", + "LAW 407\n", + "LICENSE APPL COMM 1\n", + "MAYOR'S OFFICE 85\n", + "OEMC 2102\n", + "POLICE 13414\n", + "POLICE BOARD 2\n", + "PROCUREMENT 92\n", + "PUBLIC LIBRARY 1015\n", + "STREETS & SAN 2198\n", + "TRANSPORTN 1140\n", + "TREASURER 22\n", + "WATER MGMNT 1879" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "chicago.groupby(\"Department\").agg({\"Name\": \"count\"})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 2 - Hypothesis Tests\n", + "\n", + "In this section of the lab, we will test whether the hourly wage of all hourly workers is significantly different from $30/hr. Import the correct one sample test function from scipy and perform the hypothesis test for a 95% two sided confidence interval." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TtestResult(statistic=20.6198057854942, pvalue=4.3230240486229894e-92, df=8021)\n", + "Reject the null hypothesis: The average hourly wage is significantly different from $30/hr.\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "#H0 = The average hourly wage of hourly workers is equal to $30/hour.\n", + "#H1 = The average hourly wage of hourly workers is different from $30/hour.\n", + "\n", + "# 2. Significance level\n", + "alpha = 0.05\n", + "mu = 30\n", + "\n", + "# 3. Sample\n", + "sample = chicago[chicago[\"Salary or Hourly\"]== \"Hourly\"][\"Hourly Rate\"]\n", + "\n", + "# 4. Compute statistics / 5. Get p-value\n", + "t_statistic, p_value = st.ttest_1samp(sample, mu)\n", + "print(st.ttest_1samp(sample, mu))\n", + "\n", + "if p_value < alpha:\n", + " print(\"Reject the null hypothesis: The average hourly wage is significantly different from $30/hr.\")\n", + "else:\n", + " print(\"Fail to reject the null hypothesis: The average hourly wage is not significantly different from $30/hr.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are also curious about salaries in the police force. The chief of police in Chicago claimed in a press briefing that salaries this year are higher than last year's mean of $86000/year for all salaried employees. Test this one-sided hypothesis at the 95% confidence level.\n", + "\n", + "Hint: A one tailed test has a p-value that is half of the two tailed p-value. If our hypothesis is greater than, then to reject, the test statistic must also be positive." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-statistic: 5.932870515690814\n", + "P-value: 0.9999999984921207\n", + "We can not reject the null hypothesis\n" + ] + } + ], + "source": [ + "#Hypothesis\n", + "#H0 = salaries >= $86000/year for all salaried employees\n", + "#H1 = salaries < $86000\n", + "\n", + "#Significance\n", + "alpha = 0.05\n", + "\n", + "#Sample\n", + "sample = chicago[chicago[\"Salary or Hourly\"] == \"Salary\"]\n", + "mu = 86000\n", + "annual_salary = sample[\"Annual Salary\"]\n", + "\n", + "#Compute stastic\n", + "t_statistic, p_value = st.ttest_1samp(annual_salary, mu, alternative='less')\n", + "\n", + "print(f\"T-statistic: {t_statistic}\")\n", + "print(f\"P-value: {p_value}\")\n", + "\n", + "if p_value < alpha:\n", + " print(\"We can reject the null hypothesis\")\n", + "else:\n", + " print(\"We can not reject the null hypothesis\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the `crosstab` function, find the department that has the most hourly workers. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Department with the most hourly workers: STREETS & SAN\n", + "Number of hourly workers in STREETS & SAN: 1862\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "\n", + "cross_tab = pd.crosstab(chicago['Department'], chicago['Salary or Hourly'])\n", + "\n", + "department_with_most_hourly_workers = cross_tab['Hourly'].idxmax()\n", + "\n", + "hourly_workers_in_streets_san = cross_tab.loc['STREETS & SAN', 'Hourly']\n", + "\n", + "print(\"Department with the most hourly workers:\", department_with_most_hourly_workers)\n", + "print(\"Number of hourly workers in STREETS & SAN:\", hourly_workers_in_streets_san)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The workers from the department with the most hourly workers have complained that their hourly wage is less than $35/hour. Using a one sample t-test, test this one-sided hypothesis at the 95% confidence level." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-statistic: -5.096278326234201\n", + "P-value: 0.9999997626711307\n", + "We cannot reject the null hypothesis\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "# Hypothesis \n", + "#H0 = Workers from Streets & San hourly wage <= 35$\n", + "# H1 = Workers from Streets & San hourly wage > 35$\n", + "\n", + "#Significance\n", + "alpha = 0.05\n", + "\n", + "#Sample\n", + "#on my sample i choose 560 from my population 1862 Streets & San (about 30% of the hourly workers)\n", + "sample = chicago[(chicago[\"Department\"]==\"STREETS & SAN\") & (chicago[\"Salary or Hourly\"] ==\"Hourly\")][\"Hourly Rate\"].sample(560)\n", + "mu = 35\n", + "\n", + "#stats\n", + "t_statistic, p_value = st.ttest_1samp(sample, mu, alternative=\"greater\")\n", + "\n", + "print(f\"T-statistic: {t_statistic}\")\n", + "print(f\"P-value: {p_value}\")\n", + "\n", + "if p_value < alpha:\n", + " print(\"We can reject the null hypothesis\")\n", + "else:\n", + " print(\"We cannot reject the null hypothesis\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 3: To practice - Constructing Confidence Intervals\n", + "\n", + "While testing our hypothesis is a great way to gather empirical evidence for accepting or rejecting the hypothesis, another way to gather evidence is by creating a confidence interval. A confidence interval gives us information about the true mean of the population. So for a 95% confidence interval, we are 95% sure that the mean of the population is within the confidence interval. \n", + "\n", + "\n", + "To read more about confidence intervals, click [here](https://en.wikipedia.org/wiki/Confidence_interval).\n", + "\n", + "\n", + "In the cell below, we will construct a 95% confidence interval for the mean hourly wage of all hourly workers. 
\n", + "\n", + "The confidence interval is computed in SciPy using the `t.interval` function. You can read more about this function [here](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.t.html).\n", + "\n", + "To compute the confidence interval of the hourly wage, use the 0.95 for the confidence level, number of rows - 1 for degrees of freedom, the mean of the sample for the location parameter and the standard error for the scale. The standard error can be computed using [this](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.sem.html) function in SciPy." + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95% Confidence Interval for Mean Hourly Wage: (32.52345834488425, 33.05365708767623)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "\n", + "#sample\n", + "hourly_workers = chicago[chicago['Salary or Hourly'] == 'Hourly']\n", + "\n", + "#Mean and std error\n", + "mean_hourly_wage = hourly_workers['Hourly Rate'].mean()\n", + "standard_error = st.sem(hourly_workers['Hourly Rate'])\n", + "\n", + "#confidence level\n", + "confidence_level = 0.95\n", + "\n", + "#degrees of freedom\n", + "degrees_of_freedom = len(hourly_workers) - 1\n", + "\n", + "#confidence interval using t.interval\n", + "confidence_interval = st.t.interval(confidence_level, degrees_of_freedom, loc=mean_hourly_wage, scale=standard_error)\n", + "\n", + "print(f\"95% Confidence Interval for Mean Hourly Wage: {confidence_interval}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***My insights***\n", + "I'm quite confident that the true average hourly wage of all hourly workers lies between approximately $32.52 and $33.05 with a 95% confidence level. This provides me an idea of the likely range within which the true average lies based on the available sample." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now construct the 95% confidence interval for all salaried employees in the police department in the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95% Confidence Interval for Annual Salary employees in Police: (86476.5176546444, 86496.31135162238)\n" + ] + } + ], + "source": [ + "#sample\n", + "salaried_employeed = chicago[(chicago[\"Department\"]==\"POLICE\") & (chicago[\"Salary or Hourly\"] ==\"Salary\")][\"Annual Salary\"]\n", + "\n", + "#confidance level\n", + "confidance_level = 0.95\n", + "alpha = 1 - confidance_level\n", + "\n", + "#degrees of freedom\n", + "ddof = len(salaried_employeed) - 1\n", + "\n", + "#compute mean and std error\n", + "mean = salaried_employeed.mean()\n", + "standard_error = st.sem(salaried_employeed)\n", + "\n", + "# Calculate the confidence interval using t.interval\n", + "confidence_interval = st.t.interval(alpha, df=ddof, loc=mean, scale=standard_error)\n", + "\n", + "print(f\"95% Confidence Interval for Annual Salary employees in Police: {confidence_interval}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***My insights***\n", + "The interval printed above (approximately $86476.52 to $86496.31) is not yet a 95% confidence interval for the true average annual salary of salaried employees in the \"POLICE\" department: the code passes `alpha` (0.05) rather than `confidance_level` (0.95) as the confidence argument of `st.t.interval`, so this range is only a 5% interval. The 95% interval is wider and has to be recomputed with 0.95 as the first argument. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bonus Challenge - Hypothesis Tests of Proportions\n", + "\n", + "Another type of one sample test is a hypothesis test of proportions. In this test, we examine whether the proportion of a group in our sample is significantly different than a fraction. 
\n", + "\n", + "You can read more about one sample proportion tests [here](http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/SAS/SAS6-CategoricalData/SAS6-CategoricalData2.html).\n", + "\n", + "In the cell below, use the `proportions_ztest` function from `statsmodels` to perform a hypothesis test that will determine whether the number of hourly workers in the City of Chicago is significantly different from 25% at the 95% confidence level." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Your code here:\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From d9cd6c9cb6253b9007dc48dd81c7f0f488eab201 Mon Sep 17 00:00:00 2001 From: AnaCarvalho84 <131803922+AnaCarvalho84@users.noreply.github.com> Date: Tue, 15 Aug 2023 09:22:11 +0100 Subject: [PATCH 2/3] lab done --- your-code/main.ipynb | 622 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 582 insertions(+), 40 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 59b955a..a9460c2 100755 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -12,12 +12,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "# import numpy and pandas\n", - "\n" + "import pandas as pd\n", + "import numpy as np\n", + "from scipy.stats import ttest_1samp\n", + "import scipy.stats as st\n", + "import matplotlib.pyplot as plt" ] }, { @@ -31,11 +35,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "# Your 
code here:\n" + "chicago = pd.read_csv(\"Current_Employee_Names__Salaries__and_Position_Titles.csv\")\n" ] }, { @@ -47,11 +51,130 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameJob TitlesDepartmentFull or Part-TimeSalary or HourlyTypical HoursAnnual SalaryHourly Rate
0AARON, JEFFERY MSERGEANTPOLICEFSalaryNaN101442.0NaN
1AARON, KARINAPOLICE OFFICER (ASSIGNED AS DETECTIVE)POLICEFSalaryNaN94122.0NaN
2AARON, KIMBERLEI RCHIEF CONTRACT EXPEDITERGENERAL SERVICESFSalaryNaN101592.0NaN
3ABAD JR, VICENTE MCIVIL ENGINEER IVWATER MGMNTFSalaryNaN110064.0NaN
4ABASCAL, REECE ETRAFFIC CONTROL AIDE-HOURLYOEMCPHourly20.0NaN19.86
\n", + "
" + ], + "text/plain": [ + " Name Job Titles \\\n", + "0 AARON, JEFFERY M SERGEANT \n", + "1 AARON, KARINA POLICE OFFICER (ASSIGNED AS DETECTIVE) \n", + "2 AARON, KIMBERLEI R CHIEF CONTRACT EXPEDITER \n", + "3 ABAD JR, VICENTE M CIVIL ENGINEER IV \n", + "4 ABASCAL, REECE E TRAFFIC CONTROL AIDE-HOURLY \n", + "\n", + " Department Full or Part-Time Salary or Hourly Typical Hours \\\n", + "0 POLICE F Salary NaN \n", + "1 POLICE F Salary NaN \n", + "2 GENERAL SERVICES F Salary NaN \n", + "3 WATER MGMNT F Salary NaN \n", + "4 OEMC P Hourly 20.0 \n", + "\n", + " Annual Salary Hourly Rate \n", + "0 101442.0 NaN \n", + "1 94122.0 NaN \n", + "2 101592.0 NaN \n", + "3 110064.0 NaN \n", + "4 NaN 19.86 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n", + "chicago.head()\n", "\n" ] }, @@ -64,11 +187,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Name 0\n", + "Job Titles 0\n", + "Department 0\n", + "Full or Part-Time 0\n", + "Salary or Hourly 0\n", + "Typical Hours 25161\n", + "Annual Salary 8022\n", + "Hourly Rate 25161\n", + "dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n", + "chicago_null = chicago.isnull().sum()\n", + "chicago_null\n", "\n" ] }, @@ -81,12 +224,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Salary 25161\n", + "Hourly 8022\n", + "Name: Salary or Hourly, dtype: int64" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n", - "\n" + "chicago['Salary or Hourly'].value_counts()\n" ] }, { @@ -105,12 +260,230 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 49, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Name
Department
ADMIN HEARNG39
ANIMAL CONTRL81
AVIATION1629
BOARD OF ELECTION107
BOARD OF ETHICS8
BUDGET & MGMT46
BUILDINGS269
BUSINESS AFFAIRS171
CITY CLERK84
CITY COUNCIL411
COMMUNITY DEVELOPMENT207
COPA116
CULTURAL AFFAIRS65
DISABILITIES28
DoIT99
FAMILY & SUPPORT615
FINANCE560
FIRE4641
GENERAL SERVICES980
HEALTH488
HUMAN RELATIONS16
HUMAN RESOURCES79
INSPECTOR GEN87
LAW407
LICENSE APPL COMM1
MAYOR'S OFFICE85
OEMC2102
POLICE13414
POLICE BOARD2
PROCUREMENT92
PUBLIC LIBRARY1015
STREETS & SAN2198
TRANSPORTN1140
TREASURER22
WATER MGMNT1879
\n", + "
" + ], + "text/plain": [ + " Name\n", + "Department \n", + "ADMIN HEARNG 39\n", + "ANIMAL CONTRL 81\n", + "AVIATION 1629\n", + "BOARD OF ELECTION 107\n", + "BOARD OF ETHICS 8\n", + "BUDGET & MGMT 46\n", + "BUILDINGS 269\n", + "BUSINESS AFFAIRS 171\n", + "CITY CLERK 84\n", + "CITY COUNCIL 411\n", + "COMMUNITY DEVELOPMENT 207\n", + "COPA 116\n", + "CULTURAL AFFAIRS 65\n", + "DISABILITIES 28\n", + "DoIT 99\n", + "FAMILY & SUPPORT 615\n", + "FINANCE 560\n", + "FIRE 4641\n", + "GENERAL SERVICES 980\n", + "HEALTH 488\n", + "HUMAN RELATIONS 16\n", + "HUMAN RESOURCES 79\n", + "INSPECTOR GEN 87\n", + "LAW 407\n", + "LICENSE APPL COMM 1\n", + "MAYOR'S OFFICE 85\n", + "OEMC 2102\n", + "POLICE 13414\n", + "POLICE BOARD 2\n", + "PROCUREMENT 92\n", + "PUBLIC LIBRARY 1015\n", + "STREETS & SAN 2198\n", + "TRANSPORTN 1140\n", + "TREASURER 22\n", + "WATER MGMNT 1879" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Your code here:\n", - "\n" + "chicago.groupby(\"Department\").agg({\"Name\": \"count\"})" ] }, { @@ -124,12 +497,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TtestResult(statistic=20.6198057854942, pvalue=4.3230240486229894e-92, df=8021)\n", + "Reject the null hypothesis: The average hourly wage is significantly different from $30/hr.\n" + ] + } + ], "source": [ "# Your code here:\n", - "\n" + "#H0 = The average hourly wage of hourly workers is equal to $30/hour.\n", + "#H1 = The average hourly wage of hourly workers is different from $30/hour.\n", + "\n", + "# 2. Significance level\n", + "alpha = 0.05\n", + "mu = 30\n", + "\n", + "# 3. Sample\n", + "sample = chicago[chicago[\"Salary or Hourly\"]== \"Hourly\"][\"Hourly Rate\"]\n", + "\n", + "# 4. Compute statistics / 5. 
Get p-value\n", + "t_statistic, p_value = st.ttest_1samp(sample, mu)\n", + "print(st.ttest_1samp(sample, mu))\n", + "\n", + "if p_value < alpha:\n", + " print(\"Reject the null hypothesis: The average hourly wage is significantly different from $30/hr.\")\n", + "else:\n", + " print(\"Fail to reject the null hypothesis: The average hourly wage is not significantly different from $30/hr.\")" ] }, { @@ -143,12 +542,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-statistic: 5.932870515690814\n", + "P-value: 0.9999999984921207\n", + "We can not reject the null hypothesis\n" + ] + } + ], "source": [ - "# Your code here:\n", - "\n" + "#Hypothesis\n", + "#H0 = salaries >= $86000/year for all salaried employees\n", + "#H1 = salaries < $86000\n", + "\n", + "#Significance\n", + "alpha = 0.05\n", + "\n", + "#Sample\n", + "sample = chicago[chicago[\"Salary or Hourly\"] == \"Salary\"]\n", + "mu = 86000\n", + "annual_salary = sample[\"Annual Salary\"]\n", + "\n", + "#Compute stastic\n", + "t_statistic, p_value = st.ttest_1samp(annual_salary, mu, alternative='less')\n", + "\n", + "print(f\"T-statistic: {t_statistic}\")\n", + "print(f\"P-value: {p_value}\")\n", + "\n", + "if p_value < alpha:\n", + " print(\"We can reject the null hypothesis\")\n", + "else:\n", + " print(\"We can not reject the null hypothesis\")" ] }, { @@ -160,12 +589,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 92, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Department with the most hourly workers: STREETS & SAN\n", + "Number of hourly workers in STREETS & SAN: 1862\n" + ] + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "cross_tab = pd.crosstab(chicago['Department'], chicago['Salary or Hourly'])\n", + "\n", + "department_with_most_hourly_workers = 
cross_tab['Hourly'].idxmax()\n", + "\n", + "hourly_workers_in_streets_san = cross_tab.loc[department_with_most_hourly_workers, 'Hourly']\n", + "\n", + "print(\"Department with the most hourly workers:\", department_with_most_hourly_workers)\n", + "print(f\"Number of hourly workers in {department_with_most_hourly_workers}:\", hourly_workers_in_streets_san)\n" ] }, { @@ -177,12 +623,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 91, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-statistic: -5.096278326234201\n", + "P-value: 0.9999997626711307\n", + "We cannot reject the null hypothesis\n" + ] + } + ], "source": [ "# Your code here:\n", - "\n" + "# Hypothesis \n", + "#H0 = Workers from Streets & San hourly wage <= 35$\n", + "# H1 = Workers from Streets & San hourly wage > 35$\n", + "\n", + "#Significance\n", + "alpha = 0.05\n", + "\n", + "#Sample\n", + "#for my sample I chose 560 of the 1862 hourly workers in Streets & San (about 30% of that population)\n", + "sample = chicago[(chicago[\"Department\"]==\"STREETS & SAN\") & (chicago[\"Salary or Hourly\"] ==\"Hourly\")][\"Hourly Rate\"].sample(560)\n", + "mu = 35\n", + "\n", + "#stats\n", + "t_statistic, p_value = st.ttest_1samp(sample, mu, alternative=\"greater\")\n", + "\n", + "print(f\"T-statistic: {t_statistic}\")\n", + "print(f\"P-value: {p_value}\")\n", + "\n", + "if p_value < alpha:\n", + " print(\"We can reject the null hypothesis\")\n", + "else:\n", + " print(\"We cannot reject the null hypothesis\")" ] }, { @@ -206,12 +683,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 93, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95% Confidence Interval for Mean Hourly Wage: (32.52345834488425, 33.05365708767623)\n" + ] + } + ], "source": [ "# Your code here:\n", - "\n" + "\n", + "#sample\n", + "hourly_workers = chicago[chicago['Salary or Hourly'] == 'Hourly']\n", + 
"\n", + "#Mean and std error\n", + "mean_hourly_wage = hourly_workers['Hourly Rate'].mean()\n", + "standard_error = st.sem(hourly_workers['Hourly Rate'])\n", + "\n", + "#confidence level\n", + "confidence_level = 0.95\n", + "\n", + "#degrees of freedom\n", + "degrees_of_freedom = len(hourly_workers) - 1\n", + "\n", + "#confidence interval using t.interval\n", + "confidence_interval = st.t.interval(confidence_level, degrees_of_freedom, loc=mean_hourly_wage, scale=standard_error)\n", + "\n", + "print(f\"95% Confidence Interval for Mean Hourly Wage: {confidence_interval}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***My insights***\n", + "I'm quite confident that the true average hourly wage of all hourly workers lies between approximately $32.52 and $33.05 with a 95% confidence level. This provides me an idea of the likely range within which the true average lies based on the available sample." ] }, { @@ -223,12 +733,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 99, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95% Confidence Interval for Annual Salary employees in Police: (86177.05631531784, 86795.77269094894)\n" + ] + } + ], "source": [ - "# Your code here:\n", - "\n" + "#sample\n", + "salaried_employeed = chicago[(chicago[\"Department\"]==\"POLICE\") & (chicago[\"Salary or Hourly\"] ==\"Salary\")][\"Annual Salary\"]\n", + "\n", + "#confidence level\n", + "confidence_level = 0.95\n", + "alpha = 1 - confidence_level\n", + "\n", + "#degrees of freedom\n", + "ddof = len(salaried_employeed) - 1\n", + "\n", + "#compute mean and std error\n", + "mean = salaried_employeed.mean()\n", + "standard_error = st.sem(salaried_employeed)\n", + "\n", + "# Calculate the confidence interval using t.interval (its first argument is the confidence level, not alpha)\n", + "confidence_interval = st.t.interval(confidence_level, df=ddof, loc=mean, scale=standard_error)\n", + "\n", + "print(f\"95% Confidence Interval for 
Annual Salary employees in Police: {confidence_interval}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***My insights***\n", + "I'm quite confident that the true average annual salary of employees in the \"POLICE\" department lies between approximately $86177.06 and $86795.77 with a 95% confidence level. " ] }, { @@ -246,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -257,7 +799,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -271,7 +813,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.9" } }, "nbformat": 4, From 9f0824cbb7db750f34e88de63be126da586f768f Mon Sep 17 00:00:00 2001 From: AnaCarvalho84 <131803922+AnaCarvalho84@users.noreply.github.com> Date: Sat, 19 Aug 2023 09:20:40 +0100 Subject: [PATCH 3/3] Lab done --- main.ipynb | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/main.ipynb b/main.ipynb index a9460c2..56e93ad 100644 --- a/main.ipynb +++ b/main.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -35,9 +35,28 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'Current_Employee_Names__Salaries__and_Position_Titles.csv'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m chicago 
\u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(\u001b[39m\"\u001b[39;49m\u001b[39mCurrent_Employee_Names__Salaries__and_Position_Titles.csv\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", + "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\lib\\site-packages\\pandas\\util\\_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.._deprecate_kwarg..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 210\u001b[0m kwargs[new_arg_name] \u001b[39m=\u001b[39m new_arg_value\n\u001b[1;32m--> 211\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n", + "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments..decorate..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 325\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(args) \u001b[39m>\u001b[39m num_allow_args:\n\u001b[0;32m 326\u001b[0m warnings\u001b[39m.\u001b[39mwarn(\n\u001b[0;32m 327\u001b[0m msg\u001b[39m.\u001b[39mformat(arguments\u001b[39m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 328\u001b[0m \u001b[39mFutureWarning\u001b[39;00m,\n\u001b[0;32m 329\u001b[0m stacklevel\u001b[39m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 330\u001b[0m )\n\u001b[1;32m--> 331\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n", + "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, 
date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[0;32m 935\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 936\u001b[0m dialect,\n\u001b[0;32m 937\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 946\u001b[0m defaults\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mdelimiter\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[0;32m 947\u001b[0m )\n\u001b[0;32m 948\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m--> 950\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n", + "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 602\u001b[0m _validate_names(kwds\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mnames\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[0;32m 604\u001b[0m \u001b[39m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 605\u001b[0m parser \u001b[39m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwds)\n\u001b[0;32m 607\u001b[0m \u001b[39mif\u001b[39;00m chunksize \u001b[39mor\u001b[39;00m iterator:\n\u001b[0;32m 608\u001b[0m \u001b[39mreturn\u001b[39;00m parser\n", + "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1439\u001b[0m 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptions[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m kwds[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m 1441\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles: IOHandles \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m-> 1442\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_engine \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_engine(f, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mengine)\n", + "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1733\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m mode:\n\u001b[0;32m 1734\u001b[0m mode \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m-> 1735\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39m=\u001b[39m get_handle(\n\u001b[0;32m 1736\u001b[0m f,\n\u001b[0;32m 1737\u001b[0m mode,\n\u001b[0;32m 1738\u001b[0m encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1739\u001b[0m compression\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mcompression\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1740\u001b[0m 
memory_map\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mmemory_map\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mFalse\u001b[39;49;00m),\n\u001b[0;32m 1741\u001b[0m is_text\u001b[39m=\u001b[39;49mis_text,\n\u001b[0;32m 1742\u001b[0m errors\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding_errors\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mstrict\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[0;32m 1743\u001b[0m storage_options\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mstorage_options\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1744\u001b[0m )\n\u001b[0;32m 1745\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 1746\u001b[0m f \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles\u001b[39m.\u001b[39mhandle\n", + "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\lib\\site-packages\\pandas\\io\\common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 851\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(handle, \u001b[39mstr\u001b[39m):\n\u001b[0;32m 852\u001b[0m \u001b[39m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 853\u001b[0m \u001b[39m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 854\u001b[0m \u001b[39mif\u001b[39;00m ioargs\u001b[39m.\u001b[39mencoding \u001b[39mand\u001b[39;00m 
\u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m ioargs\u001b[39m.\u001b[39mmode:\n\u001b[0;32m 855\u001b[0m \u001b[39m# Encoding\u001b[39;00m\n\u001b[1;32m--> 856\u001b[0m handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39;49m(\n\u001b[0;32m 857\u001b[0m handle,\n\u001b[0;32m 858\u001b[0m ioargs\u001b[39m.\u001b[39;49mmode,\n\u001b[0;32m 859\u001b[0m encoding\u001b[39m=\u001b[39;49mioargs\u001b[39m.\u001b[39;49mencoding,\n\u001b[0;32m 860\u001b[0m errors\u001b[39m=\u001b[39;49merrors,\n\u001b[0;32m 861\u001b[0m newline\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 862\u001b[0m )\n\u001b[0;32m 863\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 864\u001b[0m \u001b[39m# Binary mode\u001b[39;00m\n\u001b[0;32m 865\u001b[0m handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(handle, ioargs\u001b[39m.\u001b[39mmode)\n", + "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'Current_Employee_Names__Salaries__and_Position_Titles.csv'" + ] + } + ], "source": [ "chicago = pd.read_csv(\"Current_Employee_Names__Salaries__and_Position_Titles.csv\")\n" ] @@ -51,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -187,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -224,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -260,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -497,7 +516,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -542,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -589,7 +608,7 @@ }, 
{ "cell_type": "code", - "execution_count": 92, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -623,7 +642,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -683,7 +702,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -733,7 +752,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -788,7 +807,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [