maphel-langtime/popweigh_grid_trajectories.py at main · DigitalGeographyLab/maphel-langtime · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# -*- coding: utf-8 -*-
"""
Created on Thu Oct  6 10:22:03 2022

@author: TuoVaisanen-e01
"""
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import argparse as ap

# Set up the command-line argument parser.
# The argparse module is imported under the alias `ap`, so the parser is built
# from that alias and stored under its own name; this keeps the module alias
# intact and avoids referring to the (undefined) name `argparse`.
parser = ap.ArgumentParser()

# Folder holding the grid diversity history files; the metric-specific file
# name is appended directly to this string, so it should end with a separator
parser.add_argument("-dg", "--divgrid", required=True,
                    help="Path to folder containing grid diversity history files. For example: /path/to/folder/")

# Geopackage with the Somali population count history per grid cell
parser.add_argument("-sg", "--somagrid", required=True,
                    help="Path to geopackage with grid history of Somali population count")

# Geopackage with the Estonian population count history per grid cell
parser.add_argument("-eg", "--estogrid", required=True,
                    help="Path to geopackage with grid history of Estonian population count")

# Geopackage with the Finnish population count history per grid cell
parser.add_argument("-fg", "--fingrid", required=True,
                    help="Path to geopackage with grid history of Finnish population count")

# Geopackage with the total population count history per grid cell
parser.add_argument("-hg", "--hmagrid", required=True,
                    help="Path to geopackage with grid history of total population count")

# Output folder; output file names are appended directly to this string
parser.add_argument("-o", "--output", required=True,
                    help="Path to output folder. For example: /path/to/folder/. This script assumes you have access to FOLK data within Fiona")

# Parse the arguments into a plain dict keyed by the long option names
args = vars(parser.parse_args())

def get_cell_trajectories(dataframe):
    """Split grid cells into stable, new and old groups by missing values.

    Parameters
    ----------
    dataframe : DataFrame with an 'euref_250' identifier column and a '2019'
        column (other columns are only checked for missing values).

    Returns
    -------
    tuple of three DataFrames ``(stable, new, old)``:
        stable -- rows without a missing value in any column,
        new    -- remaining rows that have a value in '2019',
        old    -- remaining rows whose '2019' value is missing.
    """
    # rows that are complete across every column
    complete = dataframe.dropna()

    # everything whose identifier is not among the complete rows
    complete_ids = complete['euref_250'].values.tolist()
    in_complete = dataframe['euref_250'].isin(complete_ids)
    incomplete = dataframe[~in_complete]

    # incomplete rows that still have an observation in 2019
    appeared = incomplete.dropna(subset=['2019'])

    # the rest of the incomplete rows have no observation in 2019
    appeared_ids = appeared['euref_250'].values.tolist()
    in_appeared = incomplete['euref_250'].isin(appeared_ids)
    ceased = incomplete[~in_appeared]

    return complete, appeared, ceased

def get_popweigh_values(valdf, popdf, outputtype):
    """Compute a population-weighted mean of the values for each year.

    For every year from 1987 to 2019 the column named after the year is read
    from both dataframes; the score is the sum of value * population divided
    by the sum of the population column.

    Parameters
    ----------
    valdf : DataFrame with one value column per year ('1987' ... '2019').
    popdf : DataFrame with one population column per year ('1987' ... '2019').
    outputtype : label written to the 'type' column of every result row.

    Returns
    -------
    DataFrame indexed by year with columns 'score', 'year' and 'type'.
    """
    weighted = pd.DataFrame()

    for year in range(1987, 2020):
        col = str(year)

        # denominator: total population of the year
        total_pop = popdf[col].sum()

        # numerator: values multiplied by population, summed over the rows
        weighted_total = (valdf[col] * popdf[col]).sum()

        # one row per year, filled in cell by cell
        weighted.at[year, 'score'] = weighted_total / total_pop
        weighted.at[year, 'year'] = year
        weighted.at[year, 'type'] = outputtype

    return weighted

# Main analysis: for each diversity metric, compute yearly population-weighted
# scores per language group and cell trajectory type, save them, plot them and
# finally write the combined table.

# collects the per-metric result dataframes for the combined output
outputlist = []

# y-axis labels, in the same order as the metric names looped over below
labs = ['Unique languages', 'Shannon entropy']

# The population grids do not depend on the diversity metric, so they are read
# and pre-processed once here instead of once per metric.
sdf = gpd.read_file(args['somagrid'])
edf = gpd.read_file(args['estogrid'])
hma = gpd.read_file(args['hmagrid'])
fin = gpd.read_file(args['fingrid'])

# set somali annual range to start from 1992 to ensure enough observations
sdf = sdf.drop(columns=['1987', '1988', '1989', '1990', '1991'])

# drop all rows that have only zero observations throughout
# to remove cells without any somali or estonian speaking inhabitants
# NOTE(review): drop_duplicates(keep=False) removes every row whose yearly
# values are shared with another row, not only all-zero rows, and the
# positional column slices presumably cover exactly the year columns --
# confirm that this is the intended filter.
sdf = sdf.drop_duplicates(subset=list(sdf.columns[1:29]), keep=False)
edf = edf.drop_duplicates(subset=list(edf.columns[1:34]), keep=False)
fdf = fin.drop_duplicates(subset=list(fin.columns[1:34]), keep=False)

# replace zeros with nans for population count dataframes
edf = edf.replace(0, np.nan)
sdf = sdf.replace(0, np.nan)
fdf = fdf.replace(0, np.nan)

# get dataframe triplets
# NOTE(review): the total population triplet is not used further below.
hma_ey, hma_new, hma_old = get_cell_trajectories(hma)
est_ey, est_new, est_old = get_cell_trajectories(edf)
som_ey, som_new, som_old = get_cell_trajectories(sdf)
fin_ey, fin_new, fin_old = get_cell_trajectories(fdf)

# (population dataframe, language group label, cell type label, first year)
# Somali data starts in 1992 because the earlier columns were dropped above.
trajectories = [
    (som_ey, 'Somali-inhabited', 'Present every year', 1992),
    (som_new, 'Somali-inhabited', 'New', 1992),
    (som_old, 'Somali-inhabited', 'Ceased', 1992),
    (est_ey, 'Estonian-inhabited', 'Present every year', 1987),
    (est_new, 'Estonian-inhabited', 'New', 1987),
    (est_old, 'Estonian-inhabited', 'Ceased', 1987),
    (fin_ey, 'Finnish-inhabited', 'Present every year', 1987),
    (fin_new, 'Finnish-inhabited', 'New', 1987),
    (fin_old, 'Finnish-inhabited', 'Ceased', 1987),
]

# loop over metrics
for x, metric in enumerate(['unique_langs', 'shannon']):

    # read the grid history of the current diversity metric
    df = gpd.read_file(args['divgrid'] + '{}_grid_history.gpkg'.format(metric))

    # one single-row dataframe per (trajectory, year)
    resultlist = []

    # loop over population dataframes
    for data, grouptype, celltype, firstyear in trajectories:

        # loop over years
        for year in range(firstyear, 2020):

            column = str(year)

            # get current year from pop and value dataframe
            current = data[['euref_250', column]]
            vals = df[['euref_250', column]]

            # keep only the value grid cells present in the population dataframe
            gids = current['euref_250'].values.tolist()
            vals = vals[vals['euref_250'].isin(gids)]

            # get population sum for population weighting
            pop = current[column].sum()

            # population weighted score: sum(value * population) / sum(population)
            # NOTE(review): the product is aligned on the row index, not on
            # 'euref_250' -- this presumably relies on all grid files sharing
            # the same row order; confirm.
            score = (vals[column] * current[column]).sum() / pop

            # build the single-row result (cell-wise assignment kept so the
            # resulting dtypes and column order stay as before)
            result = pd.DataFrame()
            result.at[year, 'year'] = year
            result.at[year, 'score'] = score
            result.at[year, 'type'] = grouptype
            result.at[year, 'celltype'] = celltype

            # append to list
            resultlist.append(result)

    # concatenate dataframes and save the full per-metric table
    result = pd.concat(resultlist)
    result.to_pickle(args['output'] + 'new_old_neighbourhoods_popweigh_{}.pkl'.format(metric))

    # drop rows with ceased; copy so the new column below is written to an
    # independent dataframe rather than to a slice of the concatenated one
    result = result[result['celltype'] != 'Ceased'].copy()

    # set name of metric
    result['metric'] = metric

    # send to output dataframe list
    outputlist.append(result)

    # plot temporal development of grid cells
    sns.set(font_scale=1.3)
    fig, ax = plt.subplots(figsize=(7, 6))
    palette = {'Somali-inhabited': 'C1', 'Estonian-inhabited': 'C0', 'Finnish-inhabited': 'C3'}
    g = sns.lineplot(x='year', y='score', hue='type', style='celltype', data=result,
                     ci=99, estimator=np.nanmean, n_boot=100, palette=palette, ax=ax)
    g.set(xlabel='', ylabel=labs[x])

    # only the second plot keeps a (relabelled) legend
    legend = ax.get_legend()
    if x == 1:
        # newer matplotlib exposes the handles as `legend_handles`; older
        # releases only have `legendHandles`
        if hasattr(legend, 'legend_handles'):
            handles = legend.legend_handles
        else:
            handles = legend.legendHandles
        legend.remove()
        ax.legend(handles, ['Residential neighbourhood', 'Somali-inhabited','Estonian-inhabited', 'Finnish-inhabited',
                           '\nTemporal type','Present every year', 'New grid cells'])
    else:
        legend.remove()
    plt.savefig(args['output'] + 'grid_trajectory_popweigh_{}.pdf'.format(metric), dpi=300,
                bbox_inches='tight')

    # release the figure once it has been written to disk
    plt.close(fig)

# concatenate the outputs into a dataframe
output = pd.concat(outputlist, ignore_index=True)

# save dataframe
output.to_pickle(args['output'] + 'popweigh_divs_gridtypes.pkl')
output.to_csv(args['output'] + 'popweigh_divs_gridtypes.csv', sep=';',encoding='utf-8')