-
Notifications
You must be signed in to change notification settings - Fork 20
Expand file tree
/
Copy pathclean_data.py
More file actions
41 lines (31 loc) · 1.33 KB
/
clean_data.py
File metadata and controls
41 lines (31 loc) · 1.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# import modules
import pandas as pd
import numpy as np
# Clean GlobalTemperatures.csv into cleaned_globaldata.csv.
#
# The output keeps only the rows that have a finite LandAverageTemperature
# and only four columns: the temperature (renamed to 'class') plus the
# year / month / day extracted from the original 'dt' date column.

# load main dataset
globaldata = pd.read_csv('GlobalTemperatures.csv')

# convert date to pandas date time
# (bracket assignment: attribute assignment only updates a column when it
# already exists and otherwise silently sets a plain Python attribute)
globaldata['dt'] = pd.to_datetime(globaldata['dt'])

# create new columns for date information
globaldata['year'] = globaldata['dt'].dt.year
globaldata['month'] = globaldata['dt'].dt.month
globaldata['day'] = globaldata['dt'].dt.day

# drop original datetime column
# (the axis must be given by keyword: pandas >= 2.0 rejects drop(label, 1))
globaldata = globaldata.drop(columns='dt')

# take rows with no missing LandAverageTemperature, target variable
globaldata = globaldata[np.isfinite(globaldata['LandAverageTemperature'])]

# fill in missing data with the -9999 sentinel
globaldata = globaldata.fillna(-9999)

# drop unnecessary columns
globaldata = globaldata.drop(columns=[
    'LandMaxTemperature',
    'LandMaxTemperatureUncertainty',
    'LandMinTemperature',
    'LandMinTemperatureUncertainty',
    'LandAverageTemperatureUncertainty',
    'LandAndOceanAverageTemperature',
    'LandAndOceanAverageTemperatureUncertainty',
])

# rename target variable as class
globaldata = globaldata.rename(columns={'LandAverageTemperature': 'class'})

# write processed data to file
globaldata.to_csv('cleaned_globaldata.csv', index=False)

# exit program
# (raise SystemExit directly: the exit() helper is injected by the `site`
# module for interactive use and is not guaranteed to exist in every
# interpreter configuration)
raise SystemExit