collabgraph/cgraph.py at master · shackenberg/collabgraph · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import re
import pygraphviz as pgv
from datetime import date

class Paper:
  """One bibliography entry: a numeric id plus its ordered author names."""

  def __init__(self):
    # Authors are stored in insertion order; the id is assigned by the caller.
    self.authors = []
    self.id = 0

  def addAuthor(self, author):
    """Append `author` to the end of the author list."""
    self.authors += [author]

  def getAuthorId(self, author):
    """Return the position of the first occurrence of `author`.

    Raises ValueError when `author` is not in the list.
    """
    position = self.authors.index(author)
    return position

  def updateAuthor(self, oldauthor, newauthor):
    """Drop the first occurrence of `oldauthor` and append `newauthor`.

    The replacement goes to the end of the list, not to the old position.
    Raises ValueError when `oldauthor` is not in the list.
    """
    self.authors.remove(oldauthor)
    self.authors.append(newauthor)

  def getAuthors(self):
    """Return the author list itself (not a copy)."""
    return self.authors


class CGraph():
  """Builds and renders a co-authorship graph.

  Author lists are read either from the lines of a BibTeX file or from a
  Mendeley library; every pair of authors sharing a paper is connected by
  an edge whose pen width grows with the number of shared papers.
  """

  # Canonical spelling of the BibTeX author field used throughout the parser.
  _AUTHOR_KEYWORD = 'author ='
  # Any spacing/case variant of the field name ("Author=", "AUTHOR  =", ...).
  _AUTHOR_FIELD = re.compile(r'\s*(author)\s*(=)', re.IGNORECASE)
  # ASCII letter following a double quote -> character it stands for.
  _UMLAUTS = (
    ('a', u'ä'), ('A', u'Ä'),
    ('o', u'ö'), ('O', u'Ö'),
    ('u', u'ü'), ('U', u'Ü'),
    ('s', u'ß'),
  )

  def __init__(self):
    self.gr=pgv.AGraph()      # the graph being built
    self.authors={}           # not used by the methods of this class
    self.papers = list()      # Paper objects collected so far
    self.paper_counter = 0    # running id handed to each new Paper
    self.connections={}       # author pair key -> number of shared papers
    self.filename = ""        # used in the graph label and as default output name
    self.verbose = True       # print progress messages when true
    self.web = False          # not read by the methods of this class

  def run(self,filecontent, outputfilename = None):
    """Build and draw the graph from an iterable of BibTeX lines."""
    self.extractAuthors_fromBib(filecontent)
    self.drawNodes()
    self.drawGraph(outputfilename)

  def run_with_mendeley_data(self, outputfilename = None):
    """Build and draw the graph from the authenticated user's Mendeley library."""
    mendeley=self.auth_mendeley()
    self.extractAuthors_fromMendeley(mendeley)
    self.drawNodes()
    self.drawGraph(outputfilename)

  def extractAuthors_fromBib(self,filecontent):
    """Collect one Paper per author field found in `filecontent`.

    `filecontent` is iterated line by line; only lines whose first field
    is the author field are considered, so an author list spanning
    several lines is read from its first line only.
    """
    keyword = self._AUTHOR_KEYWORD
    for inputline in filecontent:
      # normalise "Author=" / "AUTHOR  =" etc. to the canonical keyword
      line = self._AUTHOR_FIELD.sub(keyword, inputline.lstrip())
      if line[:len(keyword)].lower() != keyword:
        continue
      line = self.sanitizeLine(line)
      self.paper_counter = self.paper_counter+1
      current_paper = Paper()
      current_paper.id = self.paper_counter
      for name in re.split(" and ",line[len(keyword):]):
        # strip so the first author (which follows "author = ") does not
        # keep a leading blank and end up as a separate node
        author = self.matchNames(name.strip())
        if self.validateAuthor(author) == 1:
          current_paper.addAuthor(author)
      self.papers.append(current_paper)

  def auth_mendeley(self):
    """Return a MendeleyClient with keys loaded from disk or freshly requested."""
    from mendeley_client import MendeleyClient
    # NOTE: XXXXXXX are placeholders, not defined names; replace them with
    # the consumer key and secret before using this code path.
    mendeley = MendeleyClient(XXXXXXX, XXXXXXX) #get from dev.mendeley.com
    try:
      mendeley.load_keys()
    except IOError:
      # no stored keys yet: request them and store them for the next run
      mendeley.get_required_keys()
      mendeley.save_keys()
    return mendeley

  def extractAuthors_fromMendeley(self,mendeley):
    """Collect one Paper per document in the Mendeley library.

    Also sets `self.filename` to "<profile name>'s library @ mendeley".
    """
    if self.verbose:
      print('getting data from Mendeley, please wait')
    # first call only fetches the total, second call asks for all items
    num_documents = mendeley.library()[u'total_results']
    documents = mendeley.library(items=num_documents)
    profilename = mendeley.profile_info('me')[u'main'][u'name']
    self.filename = profilename + '\'s library @ mendeley'
    for paper_id in documents[u'document_ids']:
      paper_details = mendeley.document_details(paper_id)
      self.paper_counter = self.paper_counter+1
      current_paper = Paper()
      current_paper.id = self.paper_counter
      for name in paper_details["authors"]:
        # same "Surname, Forename" form as used for BibTeX entries
        author = name['surname'] + ', ' + name['forename']
        author = self.matchNames(author)
        if self.validateAuthor(author) == 1:
          current_paper.addAuthor(author)
      self.papers.append(current_paper)

  def findnextAuthor(self,line,start,end,pivot):
    """Find the next author in `line`.

    Searches for ' and ' (case-insensitively) after `pivot` and before
    `end`; when none is found the author runs up to `end`.

    Returns (author, new pivot, new start).
    """
    pivot = line.lower().find(' and ',pivot+1,end)
    if pivot == -1:
      pivot = end
    author = line[start+1:pivot].lstrip().rstrip('"')
    start = pivot + 4
    return author, pivot, start

  def matchNames(self,author):
    """Replace shorter spellings of `author` in already collected papers.

    Candidate spellings are derived from `author`; every paper that
    lists a candidate gets it replaced by `author`. The argument is
    always returned unchanged.
    """
    comma = author.find(',')
    if comma == -1:
      # "Forename Surname" form: candidates are a prefix of the name
      # followed by a blank and the text after the first blank.
      # NOTE(review): the prefix length depends on the length of the
      # trailing part, so not every abbreviation is generated - confirm.
      space = author.find(' ')
      rest = author[space:].lstrip()
      for i in range(len(author[space+1:]),-2,-1):
        temp_name = author[0:len(author)-space-i] + ' ' + rest
        self._replaceAuthor(temp_name, author)
      return author
    # "Surname, Forename" form: candidates are prefixes of the name, from
    # the full name down to the surname plus two characters after the comma.
    for i in range(len(author[comma:]),2,-1):
      self._replaceAuthor(author[:comma+i], author)
    return author

  def _replaceAuthor(self, oldname, newname):
    # Swap oldname for newname in every collected paper that lists oldname.
    for paper in self.papers:
      if oldname in paper.authors:
        paper.updateAuthor(oldname,newname)

  def sanitizeLine(self,line):
    """Return the author line as clean text.

    Byte strings are decoded (UTF-8, falling back to cp1252); a trailing
    comma, braces and double quotes are removed; '~' becomes a blank;
    the umlaut notations "a / \\"a (and o, u, s, upper case) as well as
    \\ss are replaced by the corresponding characters; runs of
    whitespace are collapsed.
    """
    # Decode first so that every later step works on text; text that is
    # already decoded is left alone.
    if isinstance(line, bytes):
      try:
        line = line.decode('utf-8')
      except UnicodeDecodeError:
        line = line.decode('cp1252')
    line = ' '.join(line.split()) # remove tabs and double spaces see [1]
    if line.endswith(','):
      line = line[:-1]
    # Drop the opening quote of a quoted value up front; otherwise a name
    # starting with a/o/u/s would be mistaken for an umlaut notation.
    if re.match(r'(author =)\s*(").*(")',line) is not None:
      line = re.sub(r'(author =)\s*(")','author =',line)
    line = re.sub('[{}]','',line)
    line = re.sub('~',u' ',line)
    for letter, character in self._UMLAUTS:
      # the optional backslash covers the LaTeX form \"a as well as "a
      line = re.sub(r'\\?"' + letter,character,line)
    # literal backslash followed by "ss" (raw string: a plain '\\ss'
    # would mean "whitespace followed by s")
    line = re.sub(r'\\ss',u'ß',line)
    line = re.sub('"','',line)
    # the removals above may leave blanks at the ends or doubled blanks
    line = ' '.join(line.split()) # remove tabs and double spaces see [1]
    return line

  def validateAuthor(self,author):
    """Return 0 for the empty name and for 'others' / 'et al', otherwise 1.

    Very rudimentary filter to get rid of useless author names; the
    comparison ignores case.
    """
    if author.lower() in ('', 'others', 'et al'):
      return 0
    return 1

  def drawNodes(self):
    """Add nodes and weighted edges for all collected papers to the graph."""
    for paper in self.papers:
      authors = paper.authors
      # a single author has no pair, so add the node explicitly
      if len(authors) == 1:
        self.gr.add_node(authors[0])
      for n in range(len(authors)-1):
        for m in range(n+1,len(authors)):
          authorA = authors[n]
          authorB = authors[m]
          # order-independent key for the pair
          # NOTE(review): plain concatenation can map different pairs to
          # the same key ("ab"+"c" vs "a"+"bc"); kept for compatibility
          # with existing readers of self.connections.
          author_pair = ''.join(sorted((authorA,authorB)))
          if author_pair not in self.connections:
            self.connections[author_pair] = 1
            self.gr.add_edge((authorA,authorB))
          else:
            # pair seen before: redraw the edge with a thicker line
            self.connections[author_pair] = self.connections[author_pair] + 1
            self.gr.delete_edge(authorA,authorB)
            self.gr.add_edge((authorA,authorB),penwidth=str(self.connections[author_pair]))

  def drawGraph(self, outputfilename = None):
    """Lay out the graph and write `<outputfilename>.svg` and `.dot`.

    `outputfilename` defaults to `self.filename`. The path of the image
    is stored in `self.image_filename`.
    """
    if outputfilename is None:
      outputfilename = self.filename
    self.gr.node_attr['fontname']='Helvetica'
    label = 'Co-authorship graph for ' + self.filename + ' by Collabgraph - ' + str(date.today())
    self.gr.graph_attr['label']= label
    self.gr.graph_attr['fontname']='Helvetica'
    self.gr.graph_attr['overlap']='Prism'
    self.gr.layout(prog='neato')
    self.image_filename = outputfilename + ".svg"
    self.gr.draw(self.image_filename)

    # also keep the dot source next to the image
    self.gr.write(outputfilename+'.dot')
    if self.verbose:
      print('output:  ' + self.image_filename)


if __name__ == "__main__":
  # Command-line entry point: the single argument is either the path of a
  # BibTeX file or "-m" to read the user's Mendeley library.
  import sys
  if len(sys.argv) < 2:
    print('please give file name or enter -m for Mendeley')
    sys.exit()
  argument = sys.argv[1]

  if argument == '-m':
    ## use mendeley
    graph = CGraph()
    graph.filename = 'mendeley'
    graph.run_with_mendeley_data()
  else:
    ## use bibtex file
    filename = argument
    print('opening file  ' + filename)
    graph = CGraph()
    graph.filename = filename
    # open the file once; the with-statement closes it after the run
    with open(filename,'r') as filedata:
      graph.run(filedata)

#[1]  http://stackoverflow.com/questions/4241757/python-django-how-to-remove-extra-white-spaces-tabs-from-a-string