forked from ashkonf/pagerank
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpagerank.py
More file actions
75 lines (58 loc) · 2.23 KB
/
pagerank.py
File metadata and controls
75 lines (58 loc) · 2.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import sys
import math
import numpy
import pandas
# Generalized matrix operations:
def __extractNodes(matrix):
nodes = set()
for colKey in matrix:
nodes.add(colKey)
for rowKey in matrix.T:
nodes.add(rowKey)
return nodes
def __makeSquare(matrix, keys, default=0.0):
matrix = matrix.copy()
def insertMissingColumns(matrix):
for key in keys:
if not key in matrix:
matrix[key] = pandas.Series(default, index=matrix.index)
return matrix
matrix = insertMissingColumns(matrix) # insert missing columns
matrix = insertMissingColumns(matrix.T).T # insert missing rows
return matrix.fillna(default)
def __ensureRowsPositive(matrix):
matrix = matrix.T
for colKey in matrix:
if matrix[colKey].sum() == 0.0:
matrix[colKey] = pandas.Series(numpy.ones(len(matrix[colKey])), index=matrix.index)
return matrix.T
def __normalizeRows(matrix):
return matrix.div(matrix.sum(axis=1), axis=0)
def __euclideanNorm(series):
return math.sqrt(series.dot(series))
# PageRank specific functionality:
def __startState(nodes):
if len(nodes) == 0: raise ValueError("There must be at least one node.")
startProb = 1.0 / float(len(nodes))
return pandas.Series({node : startProb for node in nodes})
def __integrateRandomSurfer(nodes, transitionProbs, rsp):
alpha = 1.0 / float(len(nodes)) * rsp
return transitionProbs.copy().multiply(1.0 - rsp) + alpha
def powerIteration(transitionWeights, rsp=0.15, epsilon=0.00001, maxIterations=1000):
# Clerical work:
transitionWeights = pandas.DataFrame(transitionWeights)
nodes = __extractNodes(transitionWeights)
transitionWeights = __makeSquare(transitionWeights, nodes, default=0.0)
transitionWeights = __ensureRowsPositive(transitionWeights)
# Setup:
state = __startState(nodes)
transitionProbs = __normalizeRows(transitionWeights)
transitionProbs = __integrateRandomSurfer(nodes, transitionProbs, rsp)
# Power iteration:
for iteration in range(maxIterations):
oldState = state.copy()
state = state.dot(transitionProbs)
delta = state - oldState
if __euclideanNorm(delta) < epsilon: break
return state