maphel-langtime/calculate_speakers.py at main · DigitalGeographyLab/maphel-langtime · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 30 12:58:37 2022

@author: TuoVaisanen-e01
"""
import geopandas as gpd
import pandas as pd
import argparse

# set up argument parser
ap = argparse.ArgumentParser()

# Get path to input file
ap.add_argument("-gp", "--geopackage", required=True,
                help="Path to input 250 m grid (type geopackage).")

# Get path to input file
ap.add_argument("-m", "--mothertongues", required=True,
                help="Path to folder, containing information on first languages in CSVs.")

# Get path to input file
ap.add_argument("-g", "--grid", required=True,
                help="Path to folder containing grid ID information in CSVs.")

# Get path to input file
ap.add_argument("-d", "--diversity", required=True,
                help="Path to folder containing diversity metrics per grid in geopackages.")

# Get path to output file
ap.add_argument("-o", "--output", required=True,
                help="Path to output folder. For example: /path/to/folder/")


# parse arguments
args = vars(ap.parse_args())

# get hma grid
hma = gpd.read_file(args['geopackage'])

# get hma grid identifiers
hmagid = list(hma['NRO'].astype(int).values)

# loop over wanted years
for i in range(1987,2020):
    print('[INFO] - Processing year {}...'.format(str(i)))

    # create year-specific filepaths
    lpath = args['mothertongues'] + '{}_mothertongues.csv'.format(str(i))
    gpath = args['grid'] + 'henkilo_paikkatiedot_{}.csv'.format(str(i))
    dp = args['diversity'] + 'HMA_folk_processed_language_{}.gpkg'.format(str(i))

    # read annual datasets in
    langs = pd.read_csv(lpath, sep=',', encoding='utf-8')
    grids = pd.read_csv(gpath, sep=',', encoding='utf-8')

    # simplify grid to hma
    grids = grids.dropna(subset=['euref_250']).drop(columns=['euref_1000'])

    # grid id to integers
    grids['euref_250'] = grids['euref_250'].astype(int)

    # drop grids outside hma
    grids = grids[grids['euref_250'].isin(hmagid)]

    # combine language with home location
    langs = pd.merge(langs[['shnro','kieli']], grids, on='shnro', how='left')

    # delet grids from memory
    del grids

    # drop non-hma residents
    langs = langs.dropna(subset=['euref_250'])

    # grid id to integer
    langs['euref_250'] = langs['euref_250'].astype(int)

    # group by grid id
    grouped = langs.groupby(by=['euref_250'])['kieli'].apply(list).reset_index()

    # delete langs from memory
    del langs

    # calcualate population
    grouped['population'] = grouped['kieli'].apply(len)

    # calculate the number of specific speakers
    grouped['finpop'] = grouped['kieli'].apply(lambda x: x.count('fi'))
    grouped['swepop'] = grouped['kieli'].apply(lambda x: x.count('sv'))
    grouped['estpop'] = grouped['kieli'].apply(lambda x: x.count('et'))
    grouped['sompop'] = grouped['kieli'].apply(lambda x: x.count('so'))
    grouped['finswe_pop'] = grouped['finpop'] + grouped['swepop']
    grouped['foreign_pop'] = grouped['population'] - grouped['finswe_pop']

    # calculate the proportion of official language speakers
    grouped['finswe_prop'] = round((grouped['finswe_pop'] / grouped['population']) * 100, 3)
    grouped['swe_prop'] = round((grouped['swepop'] / grouped['population']) * 100, 3)
    grouped['fin_prop'] = round((grouped['finpop'] / grouped['population']) * 100, 3)

    # calculate the proportion of the two minority language speakers
    grouped['est_prop'] = round((grouped['estpop'] / grouped['population']) * 100, 3)
    grouped['som_prop'] = round((grouped['sompop'] / grouped['population']) * 100, 3)
    grouped['foreign_prop'] = round((grouped['foreign_pop'] / grouped['population']) * 100, 3)

    # calculate concentration of two minority language speakers
    grouped['est_con'] = round((grouped['estpop'] / grouped['estpop'].sum()) * 100, 3)
    grouped['som_con'] = round((grouped['sompop'] / grouped['sompop'].sum()) * 100, 3)

    # read diversity grid in
    divgrid = gpd.read_file(dp)

    # merge grouped and diversity data
    divgrid = pd.merge(divgrid, grouped[['euref_250', 'estpop', 'sompop',
                                         'finpop', 'swepop', 'finswe_pop',
                                         'foreign_pop', 'finswe_prop', 'est_prop',
                                         'som_prop', 'foreign_prop', 'est_con',
                                         'som_con']],
                       on='euref_250',
                       how='left')

    # output path
    o_path = args['output'] + 'HMA_langs_div_{}.gpkg'.format(str(i))

    # save geopackage
    print('[INFO] - Saving year {}...'.format(str(i)))
    divgrid.to_file(o_path, driver='GPKG')


print('[INFO] - ... done!')