-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcgraph.py
More file actions
executable file
·227 lines (198 loc) · 7.52 KB
/
cgraph.py
File metadata and controls
executable file
·227 lines (198 loc) · 7.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import re
import pygraphviz as pgv
from datetime import date
class Paper:
    """One bibliography entry: a numeric id plus the list of its authors."""

    def __init__(self):
        # id stays 0 until the caller assigns a running paper number.
        self.id = 0
        self.authors = []

    def addAuthor(self, author):
        """Append *author* to the end of this paper's author list."""
        self.authors.append(author)

    def getAuthorId(self, author):
        """Return the position of *author* in the author list.

        Raises ValueError when the author is not part of this paper.
        """
        return self.authors.index(author)

    def updateAuthor(self, oldauthor, newauthor):
        """Replace *oldauthor* by *newauthor*.

        The first occurrence of the old name is removed and the new name is
        appended at the end, so the replaced author moves to the last
        position. Raises ValueError when *oldauthor* is not in the list.
        """
        self.authors.remove(oldauthor)
        self.authors.append(newauthor)

    def getAuthors(self):
        """Return the author list itself (not a copy)."""
        return self.authors
class CGraph(object):
    """Build and render a co-authorship graph with pygraphviz.

    Authors are collected per paper, either from the lines of a BibTeX file
    (``run``) or from a Mendeley library (``run_with_mendeley_data``). Every
    pair of authors sharing a paper is connected by an edge whose pen width
    grows with the number of shared papers.
    """

    # Canonical spelling every recognised author field is rewritten to.
    _AUTHOR_KEYWORD = 'author ='
    # Matches "author=" in any case / spacing; compiled once, not per line.
    _AUTHOR_FIELD = re.compile(r'\s*(author)\s*(=)', re.IGNORECASE)
    # Letter following the double quote -> replacement character.
    _UMLAUTS = (
        ('a', u'ä'),
        ('A', u'Ä'),
        ('o', u'ö'),
        ('O', u'Ö'),
        ('u', u'ü'),
        ('U', u'Ü'),
        ('s', u'ß'),
    )

    def __init__(self):
        self.gr = pgv.AGraph()
        self.authors = {}          # not read or written by the methods below
        self.papers = list()       # Paper objects, in input order
        self.paper_counter = 0     # running number used as Paper.id
        # Sorted (author, author) tuple -> number of papers the pair shares.
        self.connections = {}
        self.filename = ""         # used in the graph label and as default output name
        self.verbose = True        # print progress messages when true
        self.web = False           # not read by the methods below

    def run(self, filecontent, outputfilename=None):
        """Parse BibTeX lines from *filecontent*, then build and draw the graph.

        *filecontent* is any iterable of lines (e.g. an open file).
        *outputfilename* is the output base name; ``self.filename`` is used
        when it is None.
        """
        self.extractAuthors_fromBib(filecontent)
        self.drawNodes()
        self.drawGraph(outputfilename)

    def run_with_mendeley_data(self, outputfilename=None):
        """Fetch the library from Mendeley, then build and draw the graph."""
        mendeley = self.auth_mendeley()
        self.extractAuthors_fromMendeley(mendeley)
        self.drawNodes()
        self.drawGraph(outputfilename)

    def extractAuthors_fromBib(self, filecontent):
        """Collect one Paper per ``author = ...`` line found in *filecontent*.

        Only lines whose (left-stripped) text starts with an author field are
        considered; the authors are expected on that single line, separated
        by " and ".
        """
        keyword = self._AUTHOR_KEYWORD
        for inputline in filecontent:
            line = self._AUTHOR_FIELD.sub(keyword, inputline.lstrip())
            if line[:len(keyword)].lower() != keyword:
                continue
            line = self.sanitizeLine(line)
            self.paper_counter = self.paper_counter + 1
            current_paper = Paper()
            current_paper.id = self.paper_counter
            for name in line[len(keyword):].split(" and "):
                # "author = {A and B}" leaves a blank in front of the first
                # name; strip it so the same person always maps to one node.
                author = self.matchNames(name.strip())
                if self.validateAuthor(author) == 1:
                    current_paper.addAuthor(author)
            self.papers.append(current_paper)

    def auth_mendeley(self):
        """Create a MendeleyClient and make sure its API keys are available.

        Stored keys are loaded; when that fails with IOError new keys are
        requested and saved.
        """
        from mendeley_client import MendeleyClient
        # XXXXXXX are placeholders and must be replaced by real credentials
        # before this method can run.
        mendeley = MendeleyClient(XXXXXXX, XXXXXXX) #get from dev.mendeley.com
        try:
            mendeley.load_keys()
        except IOError:
            mendeley.get_required_keys()
            mendeley.save_keys()
        return mendeley

    def extractAuthors_fromMendeley(self, mendeley):
        """Collect one Paper per document of the user's Mendeley library.

        Also sets ``self.filename`` to "<profile name>'s library @ mendeley".
        """
        if self.verbose:
            print('getting data from Mendeley, please wait')
        # First call only asks for the total, second call fetches all items.
        num_documents = mendeley.library()[u'total_results']
        documents = mendeley.library(items=num_documents)
        profilename = mendeley.profile_info('me')[u'main'][u'name']
        self.filename = profilename + '\'s library @ mendeley'
        for paper_id in documents[u'document_ids']:
            paper_details = mendeley.document_details(paper_id)
            self.paper_counter = self.paper_counter + 1
            current_paper = Paper()
            current_paper.id = self.paper_counter
            for name in paper_details["authors"]:
                author = name['surname'] + ', ' + name['forename']
                author = self.matchNames(author)
                if self.validateAuthor(author) == 1:
                    current_paper.addAuthor(author)
            self.papers.append(current_paper)

    def findnextAuthor(self, line, start, end, pivot):
        """Return the next author in *line* plus the updated search positions.

        Searches for ' and ' (case-insensitively) after *pivot* and before
        *end*; when none is found the author runs up to *end*. Returns the
        tuple ``(author, pivot, start)``.
        """
        pivot = line.lower().find(' and ', pivot + 1, end)
        if pivot == -1:
            pivot = end
        author = line[start + 1:pivot].lstrip().rstrip('"')
        start = pivot + 4
        return author, pivot, start

    def matchNames(self, author):
        """Rename shorter spellings of *author* in already collected papers.

        i.e. substitutes "Gates, B" with "Gates, Bill". Shortened candidate
        spellings are derived from *author*; every stored paper listing such
        a candidate gets it replaced by *author*. Returns *author* unchanged.
        """
        comma = author.find(',')
        if comma == -1:
            # "First Last" form: candidates are a prefix of the name followed
            # by the text after the first blank.
            space = author.find(' ')
            remainder = author[space:].lstrip()
            # NOTE(review): the prefix length runs from 1 up to
            # len(text after the first blank) + 2, i.e. it is not tied to
            # the length of the first name -- confirm this is intended.
            for i in range(len(author[space + 1:]), -2, -1):
                temp_name = author[0:len(author) - space - i] + ' ' + remainder
                self._renameInPapers(temp_name, author)
        else:
            # "Last, First" form: candidates run from the full name down to
            # the surname, the comma and the two characters after it.
            for i in range(len(author[comma:]), 2, -1):
                self._renameInPapers(author[:comma + i], author)
        return author

    def _renameInPapers(self, oldname, newname):
        """Replace *oldname* by *newname* in every stored paper listing it."""
        for paper in self.papers:
            if oldname in paper.authors:
                paper.updateAuthor(oldname, newname)

    def sanitizeLine(self, line):
        """Return *line* as clean text: decoded, de-TeXed, whitespace-normalised.

        Byte strings are decoded as UTF-8, falling back to cp1252; text that
        is already decoded is left alone. A trailing comma, braces, double
        quotes and ``~`` are removed or replaced, and the TeX umlaut
        spellings (``"a`` / ``\\"a`` ... and ``\\ss``) become real characters.
        """
        if isinstance(line, bytes):
            try:
                line = line.decode('utf-8')
            except UnicodeDecodeError:
                # cp1252 leaves a few byte values undefined; replace those
                # instead of failing on the whole entry.
                line = line.decode('cp1252', 'replace')
        line = ' '.join(line.split()) # remove tabs and double spaces see [1]
        if line.endswith(','):
            line = line[:-1]
        # Drop the opening quote of a quote-delimited value first, otherwise
        # it would pair with the first letter of the name (e.g. "A -> Ä).
        if re.match(r'(author =)\s*(").*(")', line) is not None:
            line = re.sub(r'(author =)\s*(")', 'author =', line)
        line = re.sub(r'[{}]', '', line)
        line = re.sub(r'~', u' ', line)
        for letter, umlaut in self._UMLAUTS:
            # The backslash of the TeX form \"a is optional and consumed, so
            # it no longer stays behind in the name.
            line = re.sub(r'\\?"' + letter, umlaut, line)
        # Literal backslash + "ss"; the former pattern matched "<blank>s".
        line = re.sub(r'\\ss', u'ß', line)
        line = re.sub('"', '', line)
        # The replacements above can leave blanks at the ends of the string.
        return ' '.join(line.split()) # remove tabs and double spaces see [1]

    def validateAuthor(self, author):
        """Return 0 for unusable author names ('', 'others', 'et al'), else 1."""
        if author == '':
            return 0
        if author.lower() in ("others", "et al"):
            return 0
        return 1

    def drawNodes(self):
        """Add all authors and co-author edges of the collected papers to the graph.

        A paper with exactly one author contributes a node. Every author pair
        of a paper contributes an edge; when a pair occurs again its edge is
        removed and re-added with ``penwidth`` set to the number of shared
        papers.
        """
        for paper in self.papers:
            authors = paper.authors
            if len(authors) == 1:
                self.gr.add_node(authors[0])
            for n, authorA in enumerate(authors):
                for authorB in authors[n + 1:]:
                    # A sorted tuple is order independent and, unlike the
                    # concatenated names, cannot collide for different pairs.
                    author_pair = tuple(sorted((authorA, authorB)))
                    if author_pair not in self.connections:
                        self.connections[author_pair] = 1
                        self.gr.add_edge((authorA, authorB))
                    else:
                        self.connections[author_pair] = self.connections[author_pair] + 1
                        self.gr.delete_edge(authorA, authorB)
                        self.gr.add_edge((authorA, authorB), penwidth=str(self.connections[author_pair]))

    def drawGraph(self, outputfilename=None):
        """Lay out the graph with neato and write ``<name>.svg`` and ``<name>.dot``.

        *outputfilename* is the base name without extension; ``self.filename``
        is used when it is None. The SVG path is kept in
        ``self.image_filename``.
        """
        if outputfilename is None:
            outputfilename = self.filename
        self.gr.node_attr['fontname'] = 'Helvetica'
        label = 'Co-authorship graph for ' + self.filename + ' by Collabgraph - ' + str(date.today())
        self.gr.graph_attr['label'] = label
        self.gr.graph_attr['fontname'] = 'Helvetica'
        self.gr.graph_attr['overlap'] = 'Prism'
        self.gr.layout(prog='neato')
        self.image_filename = outputfilename + ".svg"
        self.gr.draw(self.image_filename)
        # the dot source is written next to the image; comment out to skip it
        self.gr.write(outputfilename + '.dot')
        if self.verbose:
            print("output:  %s" % self.image_filename)
if __name__ == "__main__":
    # Command line entry point: cgraph.py <bibtex file> | -m
    import sys

    try:
        argument = sys.argv[1]
    except IndexError:
        # No argument given at all.
        print('please give file name or enter -m for Mendeley')
        sys.exit()

    if argument == '-m':
        ## use mendeley
        graph = CGraph()
        graph.filename = 'mendeley'
        graph.run_with_mendeley_data()
    else:
        ## use bibtex file
        filename = argument
        print('opening file  %s' % filename)
        graph = CGraph()
        graph.filename = filename
        # Open the file exactly once; the with-block closes it again even
        # when parsing or drawing fails.
        with open(filename, 'r') as filedata:
            graph.run(filedata)
#[1] http://stackoverflow.com/questions/4241757/python-django-how-to-remove-extra-white-spaces-tabs-from-a-string