# cleanfunc.py (forked from blkemp/ABS-Region-Data)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
import os
from textwrap import wrap
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import eli5
import re
import geocoder
import operator
from bokeh.models import HoverTool
from bokeh.plotting import figure, output_notebook, show, ColumnDataSource
# Base path of the current working directory (default used by load_merge_clean below)
nb_path = os.getcwd()
def evaluate_model(model, X_train, X_test, y_train, y_test):
    '''
    Plots predicted vs. actual values for the train and test sets in an
    interactive bokeh chart and prints the R2 score for each set.
    Returns the test and train prediction arrays.
    '''
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    output_notebook()
    hover = HoverTool()
    test_source = ColumnDataSource(data=dict(
        x=y_test,
        y=y_pred,
        desc=y_test.index,
    ))
    train_source = ColumnDataSource(data=dict(
        x=y_train,
        y=y_train_pred,
        desc=y_train.index,
    ))
    # Note: plot_width/plot_height and the legend kwarg follow the older bokeh
    # API this module was written against.
    p = figure(plot_width=600, plot_height=400, x_axis_label='Actual', y_axis_label='Prediction')
    p.circle('x', 'y', size=20, color='navy', alpha=0.5, source=test_source, legend='Test Data')
    p.circle('x', 'y', size=10, color='red', alpha=0.5, source=train_source, legend='Train Data')
    p.line([0, max(y_test)], [0, max(y_test)], legend='Line of perfect fit')
    hover.tooltips = [
        ("index", "$index"),
        ("(x,y)", "($x, $y)"),
        ("desc", "@desc"),
    ]
    p.tools.append(hover)
    p.legend.location = 'top_left'
    p.legend.click_policy = "hide"
    p.toolbar.logo = None
    p.toolbar_location = None
    show(p)
    print("Train data R2 score: {}".format(r2_score(y_train, y_train_pred)))
    print("Test data R2 score: {}".format(r2_score(y_test, y_pred)))
    return y_pred, y_train_pred
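# Usage sketch (not part of the original module): fits a RandomForestRegressor
# on synthetic data and passes it to evaluate_model. All names here are
# hypothetical. evaluate_model expects y_train/y_test to be pandas Series so
# that .index is available for the hover tooltips, and the bokeh output is
# intended to be rendered inside a Jupyter notebook.
def _example_evaluate_model():
    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.rand(200, 3), columns=['a', 'b', 'c'])
    y = pd.Series(X['a'] * 2 + X['b'] - X['c'] + rng.normal(0, 0.1, 200))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_train, y_train)
    return evaluate_model(model, X_train, X_test, y_train, y_test)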
def sort_series_abs(S):
    '''Takes a pandas Series object and returns the series sorted by absolute value.'''
    temp_df = pd.DataFrame(S)
    temp_df['abs'] = temp_df.iloc[:, 0].abs()
    temp_df.sort_values('abs', ascending=False, inplace=True)
    return temp_df.iloc[:, 0]
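# Usage sketch (illustrative, not from the original repo): rank regression
# coefficients by magnitude regardless of sign.
def _example_sort_series_abs():
    coefs = pd.Series({'x1': -3.0, 'x2': 0.5, 'x3': 2.0})
    return sort_series_abs(coefs)  # index order: x1 (-3.0), x3 (2.0), x2 (0.5)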
def clean_xls(file):
    '''
    Reads an Excel sheet whose column headers span four rows (as in the ABS
    region data cubes), joins the header levels into single column names and
    strips the 'Unnamed: ...' placeholders pandas inserts for blank header cells.
    '''
    df = pd.read_excel(file, skiprows=5, header=[0, 1, 2, 3], skipfooter=6)
    unnamed_str = 'Unnamed: [0-9]+_level_[0-9]+'
    df.columns = [re.sub(unnamed_str, '', ' '.join(col)).strip() for col in df.columns.values]
    return df
def remove_aggregation(df):
    '''Drops aggregated region rows (CODE values of 8 characters or fewer),
    leaving only the most granular regions.'''
    df.drop(df[df['CODE'].astype(str).map(len) <= 8].index, inplace=True)
    return df
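# Usage sketch (assumption: the data uses 9-digit ASGS region codes, so any
# code of 8 or fewer characters is an aggregate such as a state total).
def _example_remove_aggregation():
    df = pd.DataFrame({'CODE': [101021007, 1], 'value': [10, 99]})
    return remove_aggregation(df)  # keeps only the 9-digit region row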
def load_merge_clean(nb_path=os.getcwd()):
    '''
    Loads every CSV in the CSV subdirectory, inner-merges them on
    CODE/YEAR/LABEL, cleans the result, keeps the 2016 data and attaches
    latitude/longitude from latlng.csv. Returns the merged dataframe
    indexed by LABEL.
    '''
    # Create a list of all files in the CSV directory (top level only)
    csv_dir = os.path.join(nb_path, 'CSV')
    files = []
    for (dirpath, dirnames, filenames) in os.walk(csv_dir):
        files.extend(filenames)
        break
    # Keep only .csv files (building a new list avoids the bug of removing
    # items from a list while iterating over it)
    files = [f for f in files if f.endswith('.csv')]
    # Read in the csv files and merge them into one dataframe.
    # Initialise the dataframe with the first csv file
    df = pd.read_csv(os.path.join(csv_dir, files[0]), na_values='-', thousands=',')
    # Loop through the remaining CSVs and merge them in; aggregate rows in the
    # first file fall out during the inner merges
    for fname in files[1:]:
        df_temp = pd.read_csv(os.path.join(csv_dir, fname), na_values='-', thousands=',')
        df_temp = remove_aggregation(df_temp)
        df = pd.merge(df, df_temp, how='inner', on=['CODE', 'YEAR', 'LABEL'])
    df.set_index(['CODE', 'LABEL', 'YEAR'], inplace=True)
    df = clean_data(df, 'CODE')
    df = df.xs(2016, level='YEAR')
    latlng = pd.read_csv('latlng.csv')
    df = pd.merge(df, latlng, how='left', left_on=['LABEL'], right_on=['LABEL'])
    df = df.set_index(['LABEL'])
    return df
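# Usage sketch (illustrative): the function expects a CSV/ folder of ABS
# extracts under the given base path plus a latlng.csv in the working
# directory, so it can only run against the real data layout, e.g.:
#
#     regions = load_merge_clean()
#     regions.head()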
def clean_data(df, fill_mean_subset=None):
    '''
    A function to clean a dataframe for further processing: duplicate and
    empty columns are dropped, missing numeric values are imputed with the
    mean, and categorical variables are one-hot encoded.
    INPUT
    df - pandas dataframe
    fill_mean_subset - String, column name. Allows the input of a column to "subset" by when first
                       imputing missing numerical values with a series mean. E.g. if there is a
                       categorical field of "year", this allows imputing of null values with the mean
                       of each year, rather than the mean of the overall series.
    OUTPUT
    df - the cleaned dataframe
    '''
    # Remove duplicate columns
    drop_cols = []
    check_cols = df.columns.tolist()
    check_cols.sort()
    w_end = len(check_cols)
    i = 0
    # Cycle through each column name
    while i < w_end:
        # Assign a check variable the column name as a string.
        # That name string should only include characters up to 1 character after the final space,
        # e.g. "* %" or "* n"
        check_str = check_cols[i]
        check_str = check_str[:(check_str.rfind(" ") + 2)]
        for col in check_cols[(i + 1):]:
            # Look forward in the list of column names for any other items matching check_str + "*".
            # Add any matches to a list to drop; remove them from the "check" list as well to make
            # further searches more efficient.
            # (There is almost certainly a more efficient way to do this list/dict-wise.)
            if col.startswith(check_str):
                drop_cols.append(col)
                check_cols.remove(col)
                w_end -= 1
        i += 1
    df.drop(drop_cols, axis=1, inplace=True)
    # Drop empty columns
    df = df.dropna(how='all', axis=1)
    # Fill numeric columns with the mean.
    num_vars = df.select_dtypes(include=['float', 'int']).columns
    # First, fill with the mean of the subset based on the given category
    if fill_mean_subset is not None:
        index_reset = False
        index_names = list(df.index.names)
        # Filtering is awkward with a multi-index, so if the subset variable is
        # an index level, temporarily reset the index for this action
        if fill_mean_subset in index_names:
            index_reset = True
            df.reset_index(inplace=True)
        for subset_item in df[fill_mean_subset].unique().tolist():
            for col in num_vars:
                subset_mean = df[df[fill_mean_subset] == subset_item][col].mean()
                df.loc[(df[fill_mean_subset] == subset_item) & (df[col].isnull()), col] = subset_mean
        if index_reset:
            df.set_index(index_names, inplace=True)
    # For any remaining nulls, fill with the mean of the overall series
    for col in num_vars:
        df[col].fillna((df[col].mean()), inplace=True)
    # OHE the categorical variables
    cat_vars = df.select_dtypes(include=['object']).copy().columns
    for var in cat_vars:
        # For each categorical variable, add dummy columns and drop the original column
        df = pd.concat([df.drop(var, axis=1),
                        pd.get_dummies(df[var], prefix=var, prefix_sep='_', drop_first=True)], axis=1)
    # Fill OHE NaNs with 0.
    # Get the list of columns after OHE that were not in the "numeric" list from earlier,
    # using set arithmetic for speed.
    cat_vars = list(set(df.columns.tolist()) - set(num_vars.tolist()))
    for var in cat_vars:
        df[var].fillna(0, inplace=True)
    return df
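# Usage sketch (illustrative, hypothetical column names): impute missing
# values within each YEAR group first, then one-hot encode the remaining
# object column.
def _example_clean_data():
    raw = pd.DataFrame({'YEAR': [2015, 2015, 2016, 2016],
                        'income mean': [50.0, None, 70.0, 80.0],
                        'state': ['NSW', 'VIC', None, 'NSW']})
    # The missing 2015 income is filled with the 2015 mean (50.0), and 'state'
    # becomes dummy columns filled with 0 where the value was missing.
    return clean_data(raw, fill_mean_subset='YEAR')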
def buildlatlng(df):
    '''
    A function to build a database of latitudes and longitudes for the Australian census regions.
    Inputs - df: a dataframe with index.levels[1] holding the names of all the suburbs
    Outputs - latlng: a dataframe with the corresponding latitudes and longitudes looked up from OpenStreetMap
    '''
    addresses = df.index.levels[1].tolist()
    lat = []
    lng = []
    for address in addresses:
        # OSM struggles with the word "Region", so it is best to remove it
        address = re.sub('Region', '', address)
        # Try a variety of different spellings of the name; check that the result exists and is in
        # Australia, then append to the lat/long lists and move to the next address once one works
        candidates = [
            address,
            address.split('-', 1)[0] + ' Australia',
            address.split(' ', 1)[0] + ' Australia',
            address.split('-', 1)[0],
        ]
        for callstr in candidates:
            try:
                print(callstr)
                g = geocoder.osm(callstr)
                assert g.country == 'Australia'
                lat.append(g.latlng[0])
                lng.append(g.latlng[1])
                print(g)
                break
            except:
                pass
        else:
            # No candidate spelling geocoded successfully
            lat.append(None)
            lng.append(None)
    # Because of the multiple spellings of Woolooware/Woolaware there is a single error
    # to fix by hand (hard-coded to that suburb's position in the region list)
    callstr = 'woolooware'
    print(callstr)
    g = geocoder.osm(callstr)
    lat[550] = g.latlng[0]
    lng[550] = g.latlng[1]
    latlng = pd.DataFrame({'lat': lat, 'long': lng}, index=df.index.levels[1].tolist())
    return latlng
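# Usage sketch (illustrative): buildlatlng issues one OpenStreetMap request
# per region via geocoder.osm, so it needs network access and can be slow for
# large region lists. Its output pairs with load_merge_clean above, e.g.:
#
#     latlng = buildlatlng(df_multiindexed)
#     latlng.to_csv('latlng.csv', index_label='LABEL')  # index_label matches the merge key load_merge_clean expects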
def feature_impact_plot(model, X_train, n_features, y_label):
    '''
    Takes a trained model and training dataset and synthesises the impacts of the top n features
    to show their relationship to the response vector (i.e. how a change in the feature changes
    the prediction). Shows n plots of the % variance in the median prediction made.
    INPUTS
    model = Trained sklearn supervised learning model with a ".feature_importances_" attribute.
    X_train = Pandas DataFrame object. Feature set the training was completed using.
    n_features = Int. Top n features you would like to plot.
    y_label = String. Description of the response variable for axis labelling.
    Currently the min, max, 1Q and 3Q are also stored in the variable, but plotting the % impacts
    for these makes the chart less intuitive, so I recommend against plotting them all together.
    There may be use in adding functionality allowing the selection of the quartile displayed.
    '''
    # Identify the n most important features
    indices = np.argsort(model.feature_importances_)[::-1]
    columns = X_train.columns.values[indices[:n_features]]
    # Create a list object to capture the simulated prediction % variances to later turn into a DataFrame
    sim_var = []
    for col in columns:
        base_pred = model.predict(X_train)
        # Add percentiles of the base predictions to a list for use in reporting
        base_percentiles = [np.percentile(base_pred, pc) for pc in range(0, 101, 25)]
        # Create new predictions based on tweaking the parameter.
        # Copy X, resetting values to align to the base information through different iterations
        df_copy = X_train.copy()
        for val in np.arange(-X_train[col].std(), X_train[col].std(), X_train[col].std() / 50):
            df_copy[col] = X_train[col] + val
            # Add new predictions based on the changed data
            predictions = model.predict(df_copy)
            # Add percentiles of these predictions for use in reporting
            percentiles = [np.percentile(predictions, pc) for pc in range(0, 101, 25)]
            # Store the variances between these percentiles and the base prediction percentiles
            percentiles = list(map(operator.sub, percentiles, base_percentiles))
            percentiles = list(map(operator.truediv, percentiles, base_percentiles))
            sim_var.append([val, col] + percentiles)
    # Create a dataframe based off the lists created above
    df_predictions = pd.DataFrame(sim_var, columns=['Value', 'Feature'] + [0, 25, 50, 75, 100])
    # Create a subplot grid based on the number of features
    num_cols = 2
    subplot_rows = int(n_features / num_cols) + int(n_features % num_cols)
    fig, axs = plt.subplots(nrows=subplot_rows, ncols=num_cols, sharey=True, figsize=(15, 5 * subplot_rows))
    # Plot the feature variance impacts
    nlines = 1
    for i in range(axs.shape[0] * axs.shape[1]):
        if i < len(columns):
            # Cycle through each plot object in the axs array and plot the appropriate lines
            ax_row = int(i / num_cols)
            ax_column = int(i % num_cols)
            axs[ax_row, ax_column].plot(df_predictions[df_predictions['Feature'] == columns[i]]['Value'],
                                        df_predictions[df_predictions['Feature'] == columns[i]][50])
            axs[ax_row, ax_column].set_title("\n".join(wrap(columns[i], int(100 / num_cols))))
            # Track the longest title so spacing can be added if chart titles happen to be really long
            nlines = max(nlines, axs[ax_row, ax_column].get_title().count('\n'))
            # Format the y-axis as %
            if ax_column == 0:
                vals = axs[ax_row, ax_column].get_yticks()
                axs[ax_row, ax_column].set_yticklabels(['{:,.2%}'.format(x) for x in vals])
                axs[ax_row, ax_column].set_ylabel('% change to {}'.format(y_label))
        # If there is a "spare" plot, hide its axes so it simply shows as an empty space
        else:
            axs[int(i / num_cols), int(i % num_cols)].axis('off')
    # Apply spacing between subplots in case of very long headers
    fig.subplots_adjust(hspace=0.5 * nlines)
    # Show the plot
    plt.tight_layout()
    plt.show()
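# Usage sketch (illustrative, synthetic data): show how the median prediction
# moves as each of the top 4 features is shifted within one standard deviation.
def _example_feature_impact_plot():
    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.rand(300, 5), columns=['f{} n'.format(i) for i in range(5)])
    y = 3 * X['f0 n'] + X['f1 n'] ** 2 + rng.normal(0, 0.05, 300)
    model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
    feature_impact_plot(model, X, n_features=4, y_label='response')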
def histo_plots(df, num_cols):
    '''
    Takes a dataframe object and plots each column as a histogram, using the feature title as the
    chart title, within an n*(len/n) subplot frame.
    df: dataframe object whose columns you want a histogram of
    num_cols: the number of charts per output row
    '''
    columns = df.shape[1]
    plot_rows = (int(columns / num_cols) + ((columns % num_cols) != 0))
    fig, axs = plt.subplots(nrows=plot_rows,
                            ncols=num_cols, sharey=False, figsize=(15, columns))
    nlines = 1
    for i in range(axs.shape[0] * axs.shape[1]):
        if i < columns:
            try:
                df.iloc[:, i].dropna().plot.hist(ax=axs[int(i / num_cols), int(i % num_cols)])
                axs[int(i / num_cols), i % num_cols].set_title("\n".join(wrap(df.columns[i], int(100 / num_cols))))
                nlines = max(nlines, axs[int(i / num_cols), int(i % num_cols)].get_title().count('\n'))
            except:
                # Non-numeric columns cannot be plotted as a histogram; label the slot and hide it
                axs[int(i / num_cols), i % num_cols].set_title("\n".join(wrap(df.columns[i], int(100 / num_cols))))
                axs[int(i / num_cols), i % num_cols].axis('off')
        else:
            axs[int(i / num_cols), int(i % num_cols)].axis('off')
    fig.subplots_adjust(hspace=0.5 * nlines)
    plt.tight_layout()
    plt.show()
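# Usage sketch (illustrative, synthetic data): plot a histogram for each
# column of a random dataframe, two charts per row.
def _example_histo_plots():
    rng = np.random.RandomState(0)
    demo = pd.DataFrame(rng.normal(size=(200, 4)), columns=['a', 'b', 'c', 'd'])
    histo_plots(demo, num_cols=2)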
def feature_plot_h(model, X_train, n_features):
    '''
    Takes a trained model and training dataset and plots the importances of the top n features.
    Returns a horizontal bar plot showing the top n features and their corresponding feature importance.
    INPUTS
    model = Trained sklearn supervised learning model with a ".feature_importances_" attribute.
    X_train = Pandas DataFrame object. Feature set the training was completed using.
    n_features = Int. Top n features you would like to plot.
    '''
    # Identify the n most important features
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    columns = X_train.columns.values[indices[:n_features]]
    values = importances[indices][:n_features]
    columns = ['\n'.join(wrap(c, 20)) for c in columns]
    # Create the plot
    fig = plt.figure(figsize=(9, n_features))
    plt.title("Normalized Weights for {} Most Predictive Features".format(n_features), fontsize=16)
    plt.barh(np.arange(n_features), values, height=0.6, align="center", color='#00A000',
             label="Feature Weight")
    plt.barh(np.arange(n_features) - 0.3, np.cumsum(values), height=0.2, align="center", color='#00A0A0',
             label="Cumulative Feature Weight")
    plt.yticks(np.arange(n_features), columns)
    plt.xlabel("Weight", fontsize=12)
    plt.legend(loc='upper right')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
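# Usage sketch (illustrative, synthetic data): plot the five most important
# features of a forest trained on random inputs.
def _example_feature_plot_h():
    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.rand(200, 8), columns=['feat_{}'.format(i) for i in range(8)])
    y = 2 * X['feat_0'] - X['feat_3'] + rng.normal(0, 0.1, 200)
    model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
    feature_plot_h(model, X, n_features=5)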