maphel-langtime/HMA_overall_diversityplot.py at main · DigitalGeographyLab/maphel-langtime · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# -*- coding: utf-8 -*-
"""
Created on Thu Aug  4 14:12:32 2022

@author: TuoVaisanen-e01
"""

import pandas as pd
import geopandas as gpd
from collections import Counter
import skbio.diversity.alpha as sk
import numpy as np
import gc
from matplotlib import pyplot as plt
import seaborn as sns
import argparse

# set up argument parser
ap = argparse.ArgumentParser()

# Get path to input file
ap.add_argument("-g", "--grid", required=True,
                help="Path to geopackage file containing 250 m grid covering the HMA.")

# Get path to input file
ap.add_argument("-lg", "--langgrid", required=True,
                help="Path to folder containing CSVs with first language information and individual unique ids")

# Get path to input file
ap.add_argument("-hg", "--homegrid", required=True,
                help="Path to folder containing CSVs with home locations and individual unique ids")

# Get path to output file
ap.add_argument("-o", "--output", required=True,
                help="Path to output folder. For example: /path/to/folder/. The files will be named: '_langfam_diversity_euref250.pkl'")

# parse arguments
args = vars(ap.parse_args())

# function to count languages in grid and return list of counts
def langcount(langlist):
    count = Counter(langlist)
    return list(count.values())

# read grid in
print('[INFO] - Reading spatial information...')
grid = gpd.read_file(args['grid'])

# reduce grid to only HMA
grid = grid[grid['KUNTA'].isin(['091', '049', '092', '235'])]

# convert grid id to integer
grid['NRO'] = grid['NRO'].astype(int)

#get grid ids to list
gids = grid['NRO'].values.tolist()

# result list
resultlist = []

# loop over years 1987-2019 (range ends one before, so 2020 is used)
for i in range(1987,2020):
    print('[INFO] - Processing year {}...'.format(str(i)))
    # get year
    yr = i

    # create year-specific filepaths
    lpath = args['langgrid'] + str(yr) + '_mothertongues.csv'
    gpath = args['homegrid'] + 'henkilo_paikkatiedot_' + str(yr) +'.csv'
    outpath = args['output'] + str(yr) + '_langfam_diversity_euref250.pkl'

    # read annual datasets in
    langs = pd.read_csv(lpath, sep=',', encoding='utf-8')
    grids = pd.read_csv(gpath, sep=',', encoding='utf-8')

    # merge on unique individual ids
    data = grids.merge(langs[['shnro','kieli']], on='shnro')

    # drop euref 1000 column and then NaN rows in euref 250
    data = data.drop(columns=['euref_1000']).dropna()

    # convert grid id to integers
    data['euref_250'] = data['euref_250'].astype(int)

    # get only HMA grids
    data = data[data['euref_250'].isin(gids)]

    # join language family and genus data
    data = pd.merge(data, df[['alpha2', 'Family', 'Genus']], left_on='kieli', right_on='alpha2', how='left')

    # get annual metrics for whole HMA
    shannon = sk.shannon(langcount(data['kieli'].values.tolist()), base=np.e)
    fshannon = sk.shannon(langcount(data['Family'].values.tolist()), base=np.e)
    unique = sk.observed_otus(langcount(data['kieli'].values.tolist()))
    funique = sk.observed_otus(langcount(data['Family'].values.tolist()))
    menhi = sk.menhinick(langcount(data['kieli'].values.tolist()))
    brillouin = sk.brillouin_d(langcount(data['kieli'].values.tolist()))
    heip = sk.heip_e(langcount(data['kieli'].values.tolist()))
    pielou = sk.pielou_e(langcount(data['kieli'].values.tolist()))
    margalef = sk.margalef(langcount(data['kieli'].values.tolist()))
    mcintosh = sk.mcintosh_d(langcount(data['kieli'].values.tolist()))
    simpson = sk.simpson(langcount(data['kieli'].values.tolist()))

    fisecount = data['kieli'].value_counts().fi + data['kieli'].value_counts().sv
    focount = len(data) - fisecount

    # create dataframe with values
    result = pd.DataFrame()

    # add values
    result.at[yr, 'year'] = yr
    result.at[yr, 'shannon'] = shannon
    result.at[yr, 'fam_shannon'] = fshannon
    result.at[yr, 'unique_langs'] = unique
    result.at[yr, 'unique_fam'] = funique
    result.at[yr, 'menhinick'] = menhi
    result.at[yr, 'brillouin'] = brillouin
    result.at[yr, 'heip'] = heip
    result.at[yr, 'pielou'] = pielou
    result.at[yr, 'margalef'] = margalef
    result.at[yr, 'mcintosh'] = mcintosh
    result.at[yr, 'simpson'] = simpson
    result.at[yr, 'natpop'] = fisecount
    result.at[yr, 'forpop'] = focount

    # append
    resultlist.append(result)

# create result dataframe
results = pd.concat(resultlist, ignore_index=True)

# predicted dataframe from City of Helsinki databases, see article's bibliography for accurate reference
preds = {'year':[2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,
                 2032,2033,2034],
         'natpop':[988662,994050,999435,1003592,1006967,1009633,1012031,1014017,
                   1015272,1016151,1016151,1015882,1012678,1008809,1004475],
         'forpop':[216583,228244,240123,252215,264516,277019,289729,302650,
                   315783,329127,342678,356442,370427,384638,399077]}

# create dataframe
preds = pd.DataFrame().from_dict(preds)

# combine preds
results = pd.concat([results, preds], ignore_index=True)

# save results to disk
results.to_pickle(args['output'] + 'HMA_overall_diversities.pkl')

# initialize seaborn style
sns.set()

# plot the predicted nations
fig, ax = plt.subplots(figsize=(6,5))
g = sns.lineplot(x='year', y='natpop', color='orchid', data=results, ax=ax)
g = sns.lineplot(x='year', y='forpop', color='teal', data=results, ax=ax)
g.set(ylabel='Population', xlabel='')
plt.axvline(2019, 0.05, 0.96, color='k', linestyle='--')
plt.yticks(rotation=45)
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.legend(labels=['Native', 'Non-native'], loc='lower right')
plt.savefig(args['output'] + 'hma_population_preds.pdf', dpi=300, bbox_inches='tight')

# plot triple stacked plot for figure 1'
sns.set(font_scale=1.4)
fig, ax = plt.subplots(3,1, figsize=(5,15))
ax = ax.flatten()
g = sns.lineplot(x='year', y='natpop', color='orchid', data=results, ax=ax[0])
g = sns.lineplot(x='year', y='forpop', color='slateblue', data=results, ax=ax[0])
g.set(xlabel='')
g.set_ylabel('Population', fontsize=16)
ax[0].axvline(2019, 0.05, 0.96, color='k', linestyle='--')
ax[0].ticklabel_format(style='plain', axis='y', useOffset=False)
ax[0].tick_params(labelrotation=25)
ax[0].set_yticklabels([-200000, 0, 200000, 400000, 600000, 800000, 1000000, 1200000], rotation=45)
ax[0].legend(labels=['Native', 'Non-native'], loc='center left')
g = sns.lineplot(x='year', y='unique_langs', color='cadetblue', data=results, ax=ax[1])
g.set(xlabel='', ylim=(0,160))
g.set_ylabel('Unique languages', fontsize=16)
g = sns.lineplot(x='year', y='unique_fam', color='olivedrab', data=results, ax=ax[2])
g.set(xlabel='', ylim=(0,14))
g.set_ylabel('Language families', fontsize=16)
plt.savefig(args['output'] + 'figure1_bcd.pdf', dpi=400, bbox_inches='tight')


# plot global shannon
fig, ax = plt.subplots(1, 2, figsize=(14,5))
ax = ax.flatten()
g = sns.lineplot(x='year', y='shannon', color='olivedrab', data=results, ax=ax[0])
g.set(ylabel='Shannon entropy', xlabel='', ylim=(0,1.3))
g.set_title('a.', loc='left', fontsize=15)
g = sns.lineplot(x='year', y='unique_langs', color='salmon', data=results, ax=ax[1])
g.set(ylabel='Unique languages', xlabel='', ylim=(0,160))
g.set_title('b.', loc='left', fontsize=15)

plt.savefig(args['output'] + 'global_shannon_uniques_hma.pdf', dpi=300, bbox_inches='tight')


# plot the global diversities
fig, axes = plt.subplots(2,2, figsize=(12,10))
axes = axes.flatten()
g = sns.lineplot(x='year', y='shannon', data=results, ax=axes[2], color='olivedrab')
g.set(xlabel='', ylabel='Shannon entropy', ylim=(0,1.3))
g = sns.lineplot(x='year', y='fam_shannon', data=results, ax=axes[3], color='teal')
g.set(xlabel='', ylabel='Shannon entropy', ylim=(0,1.3))
g = sns.lineplot(x='year', y='unique_langs', data=results, ax=axes[0], color='salmon')
g.set(xlabel='', ylabel='Languages', ylim=(0,160))
g = sns.lineplot(x='year', y='unique_fam', data=results, ax=axes[1], color='goldenrod')
g.set(xlabel='', ylabel='Language families', ylim=(0,14))
plt.savefig(args['output'] + 'afinla_HMA_87-19.pdf', dpi=300, bbox_inches='tight')