-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathshared.py
More file actions
54 lines (42 loc) · 1.99 KB
/
shared.py
File metadata and controls
54 lines (42 loc) · 1.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""Shared data loading and processing for earthquake dashboard."""
import os
from pathlib import Path
import kagglehub # type: ignore
import pandas as pd
# Folder that contains this module.
app_dir = Path(__file__).parents[0]

# Fetch the earthquakes dataset from Kaggle. The value returned by kagglehub is
# used as the directory that holds the dataset's files.
path = kagglehub.dataset_download("shreyasur965/recent-earthquakes")

# Read the dataset's CSV file into the module-level DataFrame.
csv_file = os.path.join(path, "earthquakes.csv")
earthquakes = pd.read_csv(filepath_or_buffer=csv_file)
# --------------------------------------------------------
# Data processing
# --------------------------------------------------------
# Event timestamp: 'time' holds milliseconds since the epoch.
earthquakes['datetime'] = pd.to_datetime(earthquakes['time'], unit='ms')

# Calendar month of each event.
earthquakes['month'] = earthquakes['datetime'].dt.month

# Season label derived from the month alone (no adjustment by latitude):
#   Dec-Feb -> Winter, Mar-May -> Spring, Jun-Aug -> Summer, Sep-Nov -> Fall.
# `month % 12` moves December to 0 so that each season is a run of three
# consecutive numbers; `// 3 + 1` then yields the season number 1..4.
season_mapping = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Fall'}
earthquakes['season'] = (earthquakes['month'] % 12 // 3 + 1).map(season_mapping)
# Bucket magnitude and depth into three labelled classes each, stored in the
# new columns 'magnitude_category' and 'depth_category'. pd.cut uses
# right-closed intervals by default, so a value equal to an edge falls into
# the lower class:
#   magnitude: <= 4.0 Small   | (4.0, 6.0] Medium          | > 6.0 Large
#   depth:     <= 70.0 Shallow | (70.0, 300.0] Intermediate | > 300.0 Deep
earthquakes = earthquakes.assign(
    magnitude_category=pd.cut(
        earthquakes['magnitude'],
        bins=[float('-inf'), 4.0, 6.0, float('inf')],
        labels=['Small', 'Medium', 'Large'],
    ),
    depth_category=pd.cut(
        earthquakes['depth'],
        bins=[float('-inf'), 70.0, 300.0, float('inf')],
        labels=['Shallow', 'Intermediate', 'Deep'],
    ),
)
# Keep only rows that have every key measurement, then remove repeated events
# (rows sharing an 'id'; the first occurrence is kept).
# The index is reset last so that it is contiguous (0..n-1) after *both*
# filters. Resetting it before the de-duplication would leave gaps in the
# index wherever a duplicate row was removed.
earthquakes = (
    earthquakes
    .dropna(subset=['magnitude', 'depth', 'latitude', 'longitude'])
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)
# Remove columns that are not needed downstream.
# The CSV comes from a Kaggle download whose schema this module does not
# control, so a listed column that is already absent is skipped
# (errors="ignore") instead of raising KeyError when the module is imported.
columns_to_drop = [
    "type", "updated", "url", "detailUrl", "status", "code", "sources",
    "types", "rms", "geometryType", "placeOnly", "location", "locality",
    "postcode", "what3words", "locationDetails",
]
earthquakes = earthquakes.drop(columns=columns_to_drop, errors="ignore")