-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmachinelearns6.py
More file actions
308 lines (277 loc) · 13 KB
/
machinelearns6.py
File metadata and controls
308 lines (277 loc) · 13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#created on Jul 11 to run models on server
#machine learning class file
#copied machinelearn class from jupyter lab file
import pandas as pd
import numpy as np
class machinelearn:
def __init__(self, inputdf, feature_names):
self.inputdf = inputdf
self.feature_names = feature_names
def set_maker(self, inputdf, split_type, year, feature_names, ts=0.2, p='n'):
if split_type == 'year':
train = inputdf[inputdf['Year']!=year]
test = inputdf[inputdf['Year']==year]
elif split_type == 'function':
from sklearn.model_selection import train_test_split
train, test = train_test_split(inputdf, test_size=ts)
if p == 'y':
coef_list = []
for v in feature_names:
count = 1
names = ([])
while count <= 10: #assembling list of columns without typing them in
t1 = 'p{}0'.format(count)
t2 = '_{}'.format(v)
t3 = t1+t2
names.append(t3)
count +=1
coef_list.extend(names)
feature_names = coef_list
elif p == 'n':
train = train.reset_index()
test = test.reset_index()
#separate into X and y and create dictionary
xy = {}
xy['X'] = train[feature_names].dropna().values #convert features into numpy array
try:
xy['y'] = train['yield'].dropna().values #convert into numpy array
xy['y_test'] = test['yield'].dropna().values
except KeyError:
train['yield'] = np.nan
test['yield'] = np.nan
xy['y'] = train['yield']
xy['y_test'] = test['yield']
xy['X_test'] = test[feature_names].dropna().values
return train, test, xy, feature_names
def ridgemodel(self, test, xy, A):
from sklearn.linear_model import Ridge
#get X and y
X = xy.get('X')
y = xy.get('y')
# Fit the model
ridgemodel = Ridge(alpha=A)
ridgemodel.fit(X, y)
score = ridgemodel.score(X, y)
#get X_test and y_test
X_test = xy.get('X_test')
y_test = xy.get('y_test')
#use X_test to predict values for y in y_test
y_pred = pd.DataFrame(ridgemodel.predict(X_test),columns=['yield'],index=test.index)
#collect returned values into dictionary
modeldata = {}
modeldata['score'] = score
modeldata['y_test'] = y_test
modeldata['y_pred'] = y_pred
modeldata['model'] = 'ridge'
return modeldata
def stat_array(self, modeldata):
y_test = modeldata.get('y_test')
y_pred = modeldata.get('y_pred')
#y_pred = y_pred2['yield'].values
i = modeldata.get('type')
def list_to_string(array):
['{:.4f}'.format(x) for x in array]
ls = array.tolist()
sstring = "!".join(map(str, ls)) #single string
return sstring
stat_array = pd.DataFrame(index=[i],columns=['MAPE', 'MAE', 'RMSE', 'R^2', 'y_test', 'y_pred'])
try:
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(y_test, y_pred, squared=False) #updated
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100 #code from web statology.org https://www.statology.org/mape-python/
#print stats
stat_array.loc[i,'R^2'] = modeldata.get('score')
stat_array.loc[i, 'RMSE'] = rmse
stat_array.loc[i, 'MAPE'] = mape
stat_array.loc[i, 'MAE'] = mae
except ValueError:
pass
yt_s = [ y_test ] #list_to_string(y_test)
yp_s = [ y_pred ] #list_to_string(y_pred)
stat_array.loc[i, 'y_test'] = yt_s
stat_array.loc[i, 'y_pred'] = yp_s
return stat_array
def rfrmodel(self, test, xy, t):
#get X and y from xy
X = xy.get('X')
y = xy.get('y')
# Instantiate a Random Forest Model
from sklearn.ensemble import RandomForestRegressor
rfrmodel = RandomForestRegressor(n_estimators=t) # 100 is default value for n_estimators
rfrmodel.fit(X, y)
#score = rfrmodel.score(X,y)
#get X_test and y_test from xy
X_test = xy.get('X_test')
y_test = xy.get('y_test')
#predict y_test in y_pred
y_preddf = pd.DataFrame(rfrmodel.predict(X_test),columns=['yield'],index=test.index)
y_pred = y_preddf['yield'].values
modeldata = {}
try:
from sklearn.metrics import r2_score
score = r2_score(y_test,y_pred)
#collect returned values into dictionary
modeldata['score'] = score
except ValueError:
modeldata['score'] = np.NaN
modeldata['y_test'] = y_test
modeldata['y_pred'] = y_pred
modeldata['type'] = 'rfr'
feat_i = rfrmodel.feature_importances_
return modeldata, feat_i
def merfmodel(self, train, test, xy, mi):
#create model inputs from train and test
X = xy.get('X')
y = xy.get('y')
X_test = xy.get('X_test')
y_test = xy.get('y_test')
Z_train = np.ones((len(X), 1))
Z_test = np.ones((len(X_test), 1)) #added after finding merf documentation
#copied from google colab Copy of SepProjectCode.ipynb
from merf.merf import MERF
#from catboost import CatBoostRegressor
#regr = CatBoostRegressor(iterations=500, silent=True)
import xgboost as xgb
regr = xgb.XGBRegressor(num_parallel_tree = 85)
merfmodel = MERF(regr, max_iterations=mi)
# Mixed Effects Random Forest Training
merf_cluster = 'ADM1_NAME'
clusters_train = train[merf_cluster] #names of regions??
clusters_test = test[merf_cluster]
merfmodel.fit(X, Z_train, clusters_train, y)
#predict y_hat - called y_pred for conveinence
y_predarray = merfmodel.predict(X_test, Z_test, clusters_test) #separated into two lines after meeting
y_preddf = pd.DataFrame(y_predarray, columns=['yield'], index=test.index)
y_pred = y_preddf['yield'].values
#collect returned values into dictionary
modeldata = {}
try:
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)
modeldata['score'] = score
except:
pass
modeldata['y_test'] = y_test
modeldata['y_pred'] = y_pred
modeldata['type'] = 'merf'
feat_i = merfmodel.trained_fe_model.feature_importances_
return modeldata, feat_i
def xgbreg(self, train, test, xy,pt):
import xgboost as xgb
#pt: num_parallel_trees
X_test = xy.get('X_test')
y_test = xy.get('y_test')
X_train = xy.get('X')
y_train = xy.get('y')
dvalid = xgb.DMatrix(X_test, y_test) #didn't work when ran with XGBRegressor, switched to DMatrix
dtrain = xgb.DMatrix(X_train, y_train)
params = {'objective':'reg:squarederror','booster':'gbtree','num_parallel_tree':pt}
#'colsample_bynode': trial.suggest_float('colsample_bynode', 1e-8, 0.999, log=True)
bst = xgb.train(params=params,dtrain=dtrain,evals=[(dvalid, 'eval')])
preds = bst.predict(dvalid)
#metrics
from sklearn import metrics
score = metrics.r2_score(y_test,preds)
#collect returned values into dictionary
modeldata = {}
modeldata['score'] = score
modeldata['y_test'] = y_test
modeldata['y_pred'] = preds
modeldata['type'] = 'xgb'
return modeldata
def feat_imp_graph(self, df, W_or_A, model):
import seaborn as sns
import matplotlib.pyplot as plt
mod_df = df.loc[(df['Model']==model)]
if W_or_A == 'Weight':
fi = mod_df.groupby(['Feature Name']).agg(**{"Weight":('Importance', 'sum')}).reset_index()
title_ = 'Summed Feature Importances of Various {} Runs'.format(model)
if W_or_A == 'Average':
fi = mod_df.groupby(['Feature Name']).agg(**{"Average":('Importance', 'mean')}).reset_index()
title_ = 'Averaged Feature Importances of Various {} Runs'.format(model)
fi2 = fi.sort_values(by=[W_or_A], ascending=False)
ax =sns.barplot(x="Feature Name", y=W_or_A, data=fi2[:15])
#plot top 15 values
for item in ax.get_xticklabels():
item.set_rotation(90)
plt.title(title_)
plt.show()
return ax
#will create separate function for feature generation
def input_extraction(feature_names, percentiles, csv, mon=False):
#create function that gives options on how our input data looks and eventually returns a usable dataset for models
#inputs - feature_names: list of all features from main 5 that we want included, should be as list type
#percentiles (options are 'yes' or 'no') : decides if we want features split into percentiles or just left as is
#csv = pd.read_csv('/gpfs/data1/cmongp1/ginsburga/sp24_2/kenya_maize_a3b.csv')
#mon - this input decides if you would like to create subseasonal forecasts excluding certain months in the growing season
if mon != False:
#mon is input dataset for year where predictions are being generated
#exclusion of data is specifically for testing data only
test = pd.read_csv('/gpfs/data1/cmongp1/ginsburga/sp24_2/maize_eo_2023.csv')
#test = mon.copy()
#test = csv3[csv3.Year.isin([2019,2023])]
test['date']=pd.to_datetime(test['time'])
test['month'] = pd.DatetimeIndex(test['date']).month
csv = test[test['month']<mon]
if mon == False:
#csv2 = csv[csv['crop_cal'] != 0]
pass
csv.sort_values(['ADM1_NAME','date'],inplace=True)
try:
df = csv[['ADM1_NAME', 'Year', 'yield', 'ndvi', 'tmax', 'tmin', 'esi', 'cmax','cmin','fldas']]
except KeyError:
df = csv[['ADM1_NAME', 'Year', 'ndvi', 'tmax', 'tmin', 'esi', 'cmax','cmin','fldas']]
df['yield'] = np.nan
res = df.Year.isin([2019]).any().any()
res2 = df.loc[(df.Year <=2016)].any().any()
if (res | res2):
df.dropna(subset=['yield'],inplace=True)
else:
pass
#df = csv[['ADM1_NAME', 'Year', 'yield', 'ndvi', 'tmin', 'esi','fldas']]
if percentiles == 'no':
# subset to the following columns: Season, yield, ndvi
df_ml = df.groupby(['ADM1_NAME', 'Year']).agg(**{"ndvi":('ndvi', 'max'), "esi":('esi', 'mean'), "tmax":('tmax', 'mean'),"tmin":('tmin', 'mean'), "cmax":("cmax","sum"),"cmin":("cmin",'sum'), "fldas":('fldas','mean'),"yield":('yield', 'mean')})
#df_ml = df.groupby(['ADM1_NAME', 'Year']).agg(**{"ndvi":('ndvi', 'max'), "esi":('esi', 'mean'), "tmin":('tmin', 'mean'), "fldas":('fldas','mean'), "yield":('yield', 'mean')})
include = ['yield'] + feature_names
df = df_ml[include].reset_index()
# Drop any rows where any information is missing
elif percentiles == 'yes':
#divvy up variable info into percentiles
df2 = df.ffill()
groups = df2.groupby(['ADM1_NAME', 'Year']) #does something to align these two columns
frames = []
feature_names = ['ndvi', 'esi', 'tmax', 'tmin', 'cmax','cmin','fldas']
#groupshead = groups[0:5,:]
for key, vals in groups: # keys refers to group by features, vals = all of the data in the df
df_o= pd.DataFrame()
df_o2 = pd.DataFrame()
percentiles = ['p10', 'p20', 'p30', 'p40', 'p50', 'p60', 'p70', 'p80', 'p90', 'p100']
#percentiles = ['p40']
vals.loc[:, 'Fraction Season'] = range(1, len(vals) + 1)
vals.loc[:, 'Fraction Season'] = vals.loc[:, 'Fraction Season'] * 100 / len(vals)
for var in feature_names:
if var in vals:
for f in percentiles:
perc = int(f[1:])
closest = vals['Fraction Season'].sub(perc).abs().idxmin(skipna=True) #subtract percentile
#print([vals.loc[closest][var]])
#split
df_o[f'{f}_{var}'] = [vals.loc[closest][var]]
df_o['ADM1_NAME'] = vals.iloc[0,0]
df_o['Year'] = vals.iloc[0,1]
df_o['yield'] = vals['yield'].unique()[0]
df_o2= pd.concat([df_o2, df_o],axis=0)
frames.append(df_o) #format of percentiles is translated to an array
df_full = pd.concat(frames)
df_full['Year'] = df_full['Year'].astype(int) #code was returning key error likely because years were strings
#print(df_full)
if df_full['yield'].isna().sum() != 0:
df_full.drop(columns=['yield'],inplace=True)
else:
pass
df_full = df_full.query('2000 < Year < 2024') #modify frame
df_full= df_full.reset_index()
return df_full