forked from Youtube-NikitaTymoshenko/sql-course
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheda.sql
More file actions
61 lines (52 loc) · 1.11 KB
/
eda.sql
File metadata and controls
61 lines (52 loc) · 1.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
SELECT *
FROM salaries
LIMIT 100;
SELECT COUNT(*)
FROM salaries;
SELECT
COUNT(*)
, COUNT(*) - COUNT(salary_in_usd) AS missing_values
FROM salaries;
-- categorical data
SELECT
job_title
, COUNT(*) -- to check not only distribution, but spelling as well
FROM salaries
GROUP BY 1
ORDER BY 2 DESC
-- LIMIT 10 and generate a bar chart
-- numeric data
SELECT
job_title
, exp_level
, min(salary_in_usd)
, max(salary_in_usd)
, ROUND(avg(salary_in_usd),2) AS avg
, ROUND(variance(salary_in_usd),2) As var
, ROUND(stddev(salary_in_usd),2) as stddev
FROM salaries
GROUP BY 1,2
ORDER BY 1,2
-- distribution
SELECT -- not easy to explore
salary_in_usd
, COUNT(*)
FROM salaries
GROUP BY 1;
SELECT -- trancate allows to make 160000 from 154560
TRUNC(salary_in_usd,-1)
, COUNT(*)
FROM salaries
GROUP BY 1;
SELECT
CASE
WHEN salary_in_usd <=10000 THEN 'A'
WHEN salary_in_usd <=50000 THEN 'B'
WHEN salary_in_usd <=100000 THEN 'C'
WHEN salary_in_usd <=200000 THEN 'D'
WHEN salary_in_usd > 200000 THEN 'E' END AS salary_cat
, COUNT(*)
FROM salaries
GROUP BY 1;
SELECT corr(remote_ratio, salary_in_usd)
FROM salaries;