-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexplore_page.py
More file actions
104 lines (84 loc) · 3.51 KB
/
explore_page.py
File metadata and controls
104 lines (84 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
def divide_types(types, cutoff):
    """Group label counts into kept categories plus an "Others" bucket.

    Args:
        types: Object exposing ``.items()`` that yields ``(label, count)``
            pairs (for example the result of ``Series.value_counts()``).
        cutoff: Minimum count a label needs to keep its own entry.

    Returns:
        A dict mapping every label whose count is ``>= cutoff`` to that
        count. The "Others" key is always present (inserted first) and holds
        the summed counts of all labels below the cutoff.
    """
    categories = {"Others": 0}
    for label, count in types.items():
        if count >= cutoff:
            # Add rather than assign: a label literally named "Others" must
            # not wipe out the counts already folded into the bucket. For any
            # other label the previous value is absent, so this is a plain set.
            categories[label] = categories.get(label, 0) + count
        else:
            categories["Others"] += count
    return categories
def correct_exp(x):
    """Convert a years-of-experience answer into a float number of years.

    Args:
        x: Either one of the textual answers 'Less than 1 year' /
            'More than 50 years', or any value accepted by ``float()``.

    Returns:
        ``0.5`` for 'Less than 1 year', ``50.0`` for 'More than 50 years',
        otherwise ``float(x)``.

    Raises:
        ValueError: If ``x`` is a string that ``float()`` cannot parse.
        TypeError: If ``x`` is of a type ``float()`` does not accept.
    """
    if x == 'Less than 1 year':
        return 0.5
    if x == 'More than 50 years':
        # Return a float here as well so every branch yields the same type.
        return 50.0
    return float(x)
def correct_Education(x):
    """Collapse a detailed education-level answer into one of four labels.

    Args:
        x: Education description string.

    Returns:
        'Master degree', 'Bachelor degree' or 'Professional degree' when the
        corresponding phrase occurs in ``x`` (checked in that order),
        otherwise 'Less than a Bachelor'.
    """
    # The patterns below use the typographic apostrophe (U+2019). Map the
    # ASCII apostrophe onto it so "Master's degree" is recognised as well
    # instead of silently falling through to the default label.
    level = x.replace("'", "\u2019")
    if 'Master’s degree' in level:
        return 'Master degree'
    if 'Bachelor’s degree' in level:
        return 'Bachelor degree'
    if 'Professional degree' in level:
        return 'Professional degree'
    return 'Less than a Bachelor'
@st.cache_data
def load_data():
    """Download the survey CSV and return a cleaned DataFrame.

    Processing steps, in order: rename ``ConvertedCompYearly`` to ``Salary``,
    drop rows without a salary, relabel countries with fewer than 400
    remaining rows as 'Others', keep the columns of interest, drop rows with
    any missing value, and normalise ``YearsCodePro`` and ``EdLevel``.

    Returns:
        DataFrame with the columns ``Country``, ``EdLevel``, ``YearsCodePro``
        and ``Salary``. The original row index is kept (not reset).

    Raises:
        KeyError: If the CSV has no ``ConvertedCompYearly`` column.
    """
    # Load the dataset from Google Drive
    df = pd.read_csv('https://drive.google.com/uc?id=1ebNIs3jPNJpz1jOF2VBusHU2BsUneVU6&export=download')

    # Rename the 'ConvertedCompYearly' column to 'Salary'
    if 'ConvertedCompYearly' not in df.columns:
        raise KeyError("The column 'ConvertedCompYearly' is missing in the dataset.")
    df = df.rename({'ConvertedCompYearly': 'Salary'}, axis=1)

    # Filter rows where 'Salary' is not null
    df = df[df['Salary'].notnull()]

    # Categorize 'Country' based on a cutoff for value counts.
    # A missing country is not a key of `categories`, so it is relabelled
    # 'Others' here as well.
    types = df['Country'].value_counts()
    cutoff = 400
    categories = divide_types(types, cutoff)
    country = df['Country'].apply(lambda x: x if x in categories else 'Others')
    # `assign` returns a new frame, so nothing is written into a slice of the
    # original data (this is what triggered SettingWithCopyWarning before).
    df = df.assign(Country=country)

    # Select relevant columns and drop rows with any missing values.
    # 'Employment' is included so rows lacking it are dropped too.
    df = df[['Country', 'EdLevel', 'YearsCodePro', 'Employment', 'Salary']].dropna()

    # Apply transformations to 'YearsCodePro' and 'EdLevel'
    df = df.assign(
        YearsCodePro=df['YearsCodePro'].apply(correct_exp),
        EdLevel=df['EdLevel'].apply(correct_Education),
    )

    # 'Employment' was only needed for the missing-value filter above;
    # `drop` returns a new frame without it.
    df = df.drop(['Employment'], axis=1)
    return df
# Load data to use.
# If loading fails the error is printed and `df` falls back to an empty
# DataFrame, so the module-level name always exists; show_explore_page() then
# reports the missing columns instead of callers hitting a NameError.
try:
    df = load_data()
except Exception as e:
    print(f"Error loading data: {e}")
    df = pd.DataFrame()
else:
    print("Data loaded successfully!")
def show_explore_page(df):
    """Render the salary exploration page with Streamlit.

    Shows a title, a pie chart of row counts per country, a bar chart of mean
    salary per country and a line chart of mean salary per value of
    ``YearsCodePro``, followed by a success message.

    Args:
        df: DataFrame expected to contain the columns ``Country``,
            ``Salary`` and ``YearsCodePro``. If any of them is missing an
            error is displayed and nothing else is rendered.

    Returns:
        None.
    """
    st.title("Explore Software Engineer Salaries")
    st.write("### Stack Overflow Developer Survey 2024")

    # Check if required columns are present
    required_columns = {'Country', 'Salary', 'YearsCodePro'}
    if not required_columns.issubset(df.columns):
        # Sets have no defined iteration order; sort so the message is stable.
        st.error(f"The dataset must contain the following columns: {', '.join(sorted(required_columns))}")
        return

    # Country-wise data visualization
    st.write("#### Number of Responses by Country")
    country_data = df['Country'].value_counts()
    fig1, ax1 = plt.subplots()
    ax1.pie(country_data, labels=country_data.index, autopct="%1.2f%%", shadow=True, startangle=200)
    ax1.axis("equal")  # Equal aspect ratio ensures the pie is drawn as a circle.
    st.pyplot(fig1)
    # pyplot keeps every figure it creates alive until closed; release it so
    # repeated calls of this function do not accumulate open figures.
    plt.close(fig1)

    # Mean salary by country
    st.write("#### Mean Salary by Country")
    salary_by_country = df.groupby('Country')['Salary'].mean().sort_values()
    st.bar_chart(salary_by_country, use_container_width=True)

    # Mean salary by experience
    st.write("#### Mean Salary by Years of Professional Coding Experience")
    salary_by_experience = df.groupby('YearsCodePro')['Salary'].mean().sort_values()
    st.line_chart(salary_by_experience, use_container_width=True)

    st.success("Analysis complete!")