import glob
import re
import sys
import time

import numpy as np
import pandas as pd
import networkx as nx
from unidecode import unidecode
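
# Usage sketch (flags as parsed in __main__ below): drop the WoS .txt or
# Scopus .csv exports into ./data and run, e.g.,
#   python collabworks.py 3 -s -a
# to build a Scopus network keeping authors with at least 3 publications and
# sizing nodes by article count. With no arguments, the script defaults to
# WoS, a threshold of 1 and citation-based node sizes.
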
def get_search_engine_parameters(search_engine, nodes_size):
    """
    Obtain the WoS/Scopus parameters
    :param search_engine: String, 's': Scopus / 'w': WoS (default)
    :param nodes_size: String, 'c': node size represents citations/author / 'a': node size represents articles/author
    :return: Dictionary with the selected engine parameters
    """
    if search_engine == 's':
        search_engine_dict = {'authors_name_column_id': 'Authors', 'authors_separator': ';', 'engine_name': 'Scopus',
                              'unique_articles_column_id': 'EID',
                              'data_files_format': 'csv', 'data_files_column_separator': '',
                              'number_citations_column_id': 'Cited by'}
    else:
        search_engine_dict = {'authors_name_column_id': 'AU', 'unique_articles_column_id': 'UT',
                              'data_files_format': 'txt', 'data_files_column_separator': '\t',
                              'authors_separator': ';', 'number_citations_column_id': 'TC',
                              'engine_name': 'WoS'}
    # The node size mode is engine-independent: 'a' for articles, 'c' for citations
    search_engine_dict['node_size_mode'] = 'a' if nodes_size == 'a' else 'c'
    return search_engine_dict
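
# Example sketch (values taken from the dictionaries above):
#   params = get_search_engine_parameters('s', 'a')
#   params['engine_name']               -> 'Scopus'
#   params['authors_name_column_id']    -> 'Authors'
#   params['unique_articles_column_id'] -> 'EID'
#   params['node_size_mode']            -> 'a'
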
def get_concat_df(engine_params):
    """
    Based on the txt/csv files placed in the 'data' folder, the subroutine loads them and concatenates all the DFs
    into a single one. It also drops duplicated publications based on the unique identifier column.
    :param engine_params: Parameters dictionary
    :return: Concatenated DF
    """
    all_files = glob.glob("./data/*." + engine_params['data_files_format'])
    df_list = []
    for file in all_files:
        if engine_params['engine_name'] == 'WoS':
            part_df = pd.read_csv(file, sep=engine_params['data_files_column_separator'], encoding='utf-8-sig',
                                  index_col=False, header=0)
        else:
            part_df = pd.read_csv(file, encoding='utf-8-sig', index_col=False, header=0)
        df_list.append(part_df)
    # Concatenate the DataFrames collected in the list
    try:
        df = pd.concat(df_list)
    except ValueError:
        print('\n\n (!) No objects to concatenate. Data folder is empty. \n')
        sys.exit('End of execution')
    # Drop duplicates based on the WoS Unique Identifier or the Scopus EID
    filtered_df = df.drop_duplicates(engine_params['unique_articles_column_id'])
    return filtered_df
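
# Expected layout (an assumption based on the defaults above): WoS exports are
# tab-separated .txt files and Scopus exports are .csv files, all placed in
# ./data, e.g.
#   ./data/wos_export_1.txt    or    ./data/scopus_export_1.csv
# File names are irrelevant; only the extension must match the selected engine.
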
def print_progress(iteration, total, prefix='', suffix='', decimals=2, bar_length=60):
    """
    Call in a loop to create a terminal progress bar
    @params:
        iteration   - Required : current iteration (Int)
        total       - Required : total iterations (Int)
        prefix      - Optional : prefix string (Str)
        suffix      - Optional : suffix string (Str)
        decimals    - Optional : number of decimals in percent complete (Int)
        bar_length  - Optional : character length of bar (Int)
    """
    filled_length = int(round(bar_length * iteration / float(total)))
    percents = round(100.00 * (iteration / float(total)), decimals)
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix))
    sys.stdout.flush()
    if iteration == total:
        # Print a newline once the bar completes so the following output starts clean
        sys.stdout.write('\n')
        sys.stdout.flush()
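
# Usage sketch: called once per iteration of a loop, e.g.
#   for i in range(1, total + 1):
#       process_author(i)          # hypothetical per-iteration work
#       print_progress(i, total)
# which keeps redrawing a single line such as  |████████----| 66.67%
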
def authors_format(ds, engine_params):
    """
    Sets the appropriate name format for all authors in a Series
    :param ds: Series to be formatted
    :param engine_params: Parameters dictionary
    :return: Formatted Series
    """
    # Removing parentheses in order to avoid regex matching groups; dots are stripped and accents transliterated
    if engine_params['engine_name'] == 'WoS':
        ds = ds.apply(lambda element: unidecode(str(element).upper().replace(' ', '').replace(',', ', ')
                                                .replace('(', ' ').replace(')', '').replace('.', '')))
    else:
        ds = ds.apply(lambda element: unidecode(str(element).upper().replace(' ', '')
                                                .replace('(', ' ').replace(')', '').replace('.,', '.;').replace('.', '')))
    return ds
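
# Worked example (illustrative inputs): for WoS, 'García, J.' becomes
# 'GARCIA, J' (uppercased, accents transliterated, dots dropped); for Scopus,
# 'García J., Smith A.' becomes 'GARCIAJ;SMITHA', since '.,' is first turned
# into the ';' author separator before the remaining dots are removed.
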
def populate_adj_matrix(ds, authors_set_list, engine_params):
    """
    This module initializes and populates an adjacency matrix (DataFrame) based on the rows of a Series, which
    contains in each row the set of authors co-involved in a scientific publication.
    :param ds: Co-authorship Series
    :param authors_set_list: List of the authors contained in the co-authorship Series
    :param engine_params: Dict containing all search engine parameters
    :return: Adjacency matrix (DataFrame)
    """
    print("\n3) Building network...\n")
    # Get number of authors
    num_authors = len(authors_set_list)
    # Create adjacency matrix: an array of 0s of len². Indexes and columns carry the authors' names
    adj_df = pd.DataFrame([[0] * num_authors] * num_authors, columns=authors_set_list, index=authors_set_list)
    # Progress bar counter
    i = 0
    for author_name in authors_set_list:
        # Filter the Series by author name (regex matching, escaping any special characters in the name).
        # The result only holds the rows which contain the author's name
        author_collaborators_ds = ds[ds.str.contains(re.escape(author_name) + r'(?:$|\W)')]
        # Split each row using the appropriate separator
        split_author_collaborators_ds = author_collaborators_ds.str.split(engine_params['authors_separator'])
        # Flatten the coauthor lists of every coauthored publication, then compute the histogram of coauthors
        value_counts_author_collaborations_ds = pd.Series([author
                                                           for coauthors_list in split_author_collaborators_ds
                                                           for author in coauthors_list]).value_counts()
        # Drop the author's own name from the histogram Series
        value_counts_author_collaborations_ds.drop(author_name, inplace=True)
        # Write the coauthorship histogram into the author's row of the adjacency matrix
        adj_df.loc[author_name, value_counts_author_collaborations_ds.index] = value_counts_author_collaborations_ds
        # Step the progress bar
        i += 1
        print_progress(i, num_authors)
    return adj_df
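
# Toy example (illustrative): given the two formatted WoS rows
#   'GARCIA, J;SMITH, A'  and  'GARCIA, J;SMITH, A;DOE, B'
# the row of 'GARCIA, J' holds 2 under 'SMITH, A' and 1 under 'DOE, B':
# each cell counts the publications that the two authors share.
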
def get_adjacency_df(main_df, min_weight, engine_params):
    """
    Intermediate subroutine. Builds the authors sets and the filtered adjacency matrix.
    :param main_df: Main DataFrame. Contains all the articles' information
    :param min_weight: Minimum number of articles an author must have in order to appear in the graph
    :param engine_params: Dictionary of parameters
    :return: Adjacency DF, list of retained authors, histogram of articles per author
    """
    print("\n2) Building authors sets...")
    # Select the column which contains the authors' names
    ds = main_df[engine_params['authors_name_column_id']]
    ds = authors_format(ds, engine_params)
    # Number of articles within the current Series
    num_articles = len(ds)
    # List of all appearances of authors' names
    all_authors_appearance_list = [author for row in ds for author in row.split(engine_params['authors_separator'])]
    # Series of all appearances of authors' names
    all_authors_appearance_ds = pd.Series(all_authors_appearance_list)
    # Value counts (histogram) of all_authors_appearance_ds
    authors_histogram_ds = all_authors_appearance_ds.value_counts()
    # Set of authors' names
    authors_set = set(all_authors_appearance_list)
    # List of authors_set
    authors_set_list = list(authors_set)
    # Obtain the adjacency DF based on the co-authorship Series
    adj_df = populate_adj_matrix(ds, authors_set_list, engine_params)
    # Filter the authors of the adjacency matrix based on the number of publications.
    # min_weight is the minimum number of publications an author must have
    # in order not to be dropped from the adjacency matrix.
    author_to_be_deleted_list = list(authors_histogram_ds[authors_histogram_ds < min_weight].index)
    # Drop the filtered-out authors from the adjacency matrix,
    # first along the rows (axis 0) and then along the columns (axis 1)
    adj_df = adj_df.drop(author_to_be_deleted_list, axis=0)
    adj_df = adj_df.drop(author_to_be_deleted_list, axis=1)
    # Keep only the authors which meet the threshold
    authors_set_list = [author for author in authors_set_list if author not in author_to_be_deleted_list]
    # Order columns and indexes in the same authors order
    adj_df = adj_df.loc[authors_set_list, authors_set_list]
    return adj_df, authors_set_list, authors_histogram_ds
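
# Note on the threshold (as implemented above): the filtering happens after the
# full adjacency matrix has been built, so min_weight = 1 keeps every author,
# while e.g. min_weight = 3 keeps only authors with at least 3 publications.
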
def get_num_articles(G, number_of_articles_per_author_ds, authors_set_list):
    """
    Given a NetworkX graph G, it obtains the nodes' sizes based on the number of articles per author
    :param G: Graph
    :param number_of_articles_per_author_ds: Series containing the number of articles per author
    :param authors_set_list: List of the authors within the graph
    :return: Graph with the size property set
    """
    # Maximum number of publications found in the DataFrame
    x_max = number_of_articles_per_author_ds.max()
    # Nominal maximum node size, used to derive the scaling constant k
    size_max = 13
    k = x_max / size_max
    for i in range(len(authors_set_list)):
        # Set a relation between node identifier and author name
        G.nodes[i]['Label'] = authors_set_list[i]
        # Set the normalized number of articles as the node size property
        # x_i: number of publications of the author
        x_i = number_of_articles_per_author_ds[authors_set_list[i]]
        if x_i >= k:
            G.nodes[i]['size'] = str(int(x_i / k + k))
        else:
            G.nodes[i]['size'] = str(x_i + 1)
    return G
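
# Worked example (illustrative numbers): with x_max = 52 publications,
# k = 52 / 13 = 4.0, so an author with 40 articles gets size int(40/4 + 4) = 14,
# while an author with 2 articles falls below k and gets size 2 + 1 = 3.
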
def get_citations_author(G, main_df, authors_set_list, engine_params):
    """
    Given a NetworkX graph G, it obtains the nodes' sizes based on the number of citations per author
    :param G: Graph
    :param main_df: Main DataFrame. Contains all the articles' information
    :param authors_set_list: List of the authors within the graph
    :param engine_params: Parameters dictionary
    :return: Graph with the size property set
    """
    main_df[engine_params['authors_name_column_id']] = \
        authors_format(main_df[engine_params['authors_name_column_id']], engine_params)
    # Fill NaN with 0 (Scopus considers NaN elements as 0 citations)
    main_df[engine_params['number_citations_column_id']] = \
        main_df[engine_params['number_citations_column_id']].fillna(0)
    for i in range(len(authors_set_list)):
        # Return the author's name given an index i
        author_name = authors_set_list[i]
        # Obtain a new DataFrame with the rows in which the author's name appears (as a substring) in the authors column
        substring_researcher_df = \
            main_df[main_df[engine_params['authors_name_column_id']].str.contains(re.escape(author_name) + r'(?:$|\W)')]
        # Obtain the total number of citations accumulated over those articles
        number_citations_author = substring_researcher_df[engine_params['number_citations_column_id']].astype('int').sum()
        # Set a relation between node identifier and author name
        G.nodes[i]['Label'] = author_name
        # Set the normalized number of citations as the node size property. The normalization function is a
        # power function with exponent 1/4.
        # c_i: normalized number of citations of the author
        c_i = int(np.power(number_citations_author, 0.25)) + 1
        G.nodes[i]['size'] = str(c_i)
    return G
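
# Worked example (illustrative numbers): an author with 256 accumulated
# citations gets c_i = int(256 ** 0.25) + 1 = 5, while an uncited author gets
# c_i = 1, so node sizes grow slowly (fourth root) with the citation count.
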
def export_graph(main_df, publications_threshold, engine_params):
    """
    Generates and exports the obtained graph.
    :param main_df: Main DataFrame. Contains all the articles' information
    :param publications_threshold: Minimum number of publications an author needs to appear in the graph
    :param engine_params: Parameters dictionary
    :return: GraphML file containing the graph. Exported to the execution directory.
    """
    # Begin by dropping all NaN from the authors names column and resetting the index
    main_df.dropna(subset=[engine_params['authors_name_column_id']], inplace=True)
    main_df.reset_index(inplace=True)
    # Obtain the adjacency matrix as a pandas DF
    authors_adjacency_df, authors_set, number_of_articles_per_author_ds = \
        get_adjacency_df(main_df, publications_threshold, engine_params)
    # Based on the generated adjacency matrix, obtain the NetworkX graph
    G = nx.from_numpy_array(authors_adjacency_df.values)
    if engine_params['node_size_mode'] == 'c':
        # NODE SIZES BASED ON THE NUMBER OF CITATIONS PER AUTHOR
        print('\n4) Calculating nodes sizes based on the # citations per author...')
        G = get_citations_author(G, main_df, authors_set, engine_params)
        # Proceed to export the obtained NetworkX graph as a GraphML network
        print("\n5) Exporting network...\n")
        nx.write_graphml(G, 'Graph [' + engine_params['engine_name'] + ' - Threshold ' + str(publications_threshold) +
                         ' - # Citations].graphml')
    else:
        # NODE SIZES BASED ON THE NUMBER OF PUBLICATIONS PER AUTHOR
        print('\n4) Calculating nodes sizes based on the # articles per author...\n')
        G = get_num_articles(G, number_of_articles_per_author_ds, authors_set)
        # Proceed to export the obtained NetworkX graph as a GraphML network
        print("5) Exporting network...\n")
        nx.write_graphml(G, 'Graph [' + engine_params['engine_name'] + ' - Threshold ' + str(publications_threshold) +
                         ' - # Articles].graphml')
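
# Example output file name (following the pattern above):
#   Graph [WoS - Threshold 3 - # Citations].graphml
# The GraphML file can be opened in any GraphML-aware tool, e.g. Gephi.
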
if __name__ == "__main__":
    print("\n\nCOLLABWORKS")
    print("Python WoS/Scopus collaboration networks tool\n")
    args = sys.argv[1:]
    print('\n -- Network properties -- ')
    if args:
        weight_threshold = [item for item in args if item.isdigit()]
        if weight_threshold:
            weight_threshold = int(weight_threshold[0])
            print(" - Publications threshold equal to", weight_threshold)
        else:
            weight_threshold = 1
            print(" - No publications threshold specified. It will be set equal to 1.")
        if '-s' in args:
            # Scopus is the selected search engine
            print(" - Using Scopus as the scientific database")
            if '-a' in args:
                print(' - Node sizes will be calculated based on the number of articles per author')
                engine_params_dict = get_search_engine_parameters('s', 'a')
            else:
                print(' - Node sizes will be calculated based on the number of citations per author')
                engine_params_dict = get_search_engine_parameters('s', 'c')
        else:
            # WoS is the selected (default) search engine
            print(" - Using WoS as the default scientific database")
            if '-a' in args:
                print(' - Node sizes will be calculated based on the number of articles per author')
                engine_params_dict = get_search_engine_parameters('w', 'a')
            else:
                print(' - Node sizes will be calculated based on the number of citations per author')
                engine_params_dict = get_search_engine_parameters('w', 'c')
    else:
        print(' - Publications threshold unset\n'
              ' - WoS set as the default search engine\n'
              ' - Node sizes will be calculated based on the number of citations per author\n')
        weight_threshold = 1
        engine_params_dict = get_search_engine_parameters('w', 'c')
    tic = time.time()
    print("\n\n1) Building concatenated database...")
    df = get_concat_df(engine_params_dict)
    export_graph(df, weight_threshold, engine_params_dict)
    tac = time.time()
    time_needed = int(tac - tic)
    # Convert the elapsed seconds into an h/m/s string
    m, s = divmod(time_needed, 60)
    h, m = divmod(m, 60)
    string_time = str(int(h)) + 'h ' + str(int(m)) + 'm ' + str(int(s)) + 's'
    print('\nExecution time: ' + string_time)