PyPageRankGuess -- 猜想实现 Google Page Rank ::-- ZoomQuiet [2006-12-19 09:51:41]
1. 算法實作
Tsung's Blog -- Google PageRank 演算法實作(Python版)
1.1. Algorithm in 126 Lines
1 # PageRank algorithm
2 # By Peter Bengtsson
3 # http://www.peterbe.com/
4 # [email protected]
5 #
6 # Requires the numarray module
7 # http://www.stsci.edu/resources/software_hardware/numarray
8
9 from numarray import *
10 import numarray.linear_algebra as la
11
12 def _sum_sequence(seq):
13 """ sums up a sequence """
14 def _add(x,y): return x+y
15 return reduce(_add, seq, 0)
16
17 class PageRanker:
18 def __init__(self, p, webmatrix):
19 assert p>=0 and p <= 1
20 self.p = float(p)
21 if type(webmatrix)in [type([]), type(())]:
22 webmatrix = array(webmatrix, Float)
23 assert webmatrix.shape[0]==webmatrix.shape[1]
24 self.webmatrix = webmatrix
25
26 # create the deltamatrix
27 imatrix = identity(webmatrix.shape[0], Float)
28 for i in range(webmatrix.shape[0]):
29 imatrix[i] = imatrix[i]*sum(webmatrix[i,:])
30 deltamatrix = la.inverse(imatrix)
31 self.deltamatrix = deltamatrix
32
33 # create the fmatrix
34 self.fmatrix = ones(webmatrix.shape, Float)
35
36 self.sigma = webmatrix.shape[0]
37
38 # calculate the Stochastic matrix
39 _f_normalized = (self.sigma**-1)*self.fmatrix
40 _randmatrix = (1-p)*_f_normalized
41
42 _linkedmatrix = p * matrixmultiply(deltamatrix, webmatrix)
43 M = _randmatrix + _linkedmatrix
44
45 self.stochasticmatrix = M
46
47 self.invariantmeasure = ones((1, webmatrix.shape[0]), Float)
48
49
50 def improve_guess(self, times=1):
51 for i in range(times):
52 self._improve()
53
54 def _improve(self):
55 self.invariantmeasure = matrixmultiply(self.invariantmeasure, self.stochasticmatrix)
56
57 def get_invariant_measure(self):
58 return self.invariantmeasure
59
60 def getPageRank(self):
61 sum = _sum_sequence(self.invariantmeasure[0])
62 copy = self.invariantmeasure[0]
63 for i in range(len(copy)):
64 copy[i] = copy[i]/sum
65 return copy
66
67 if __name__=='__main__':
68 # Example usage
69 web = ((0, 1, 0, 0),
70 (0, 0, 1, 0),
71 (0, 0, 0, 1),
72 (1, 0, 0, 0))
73
74 pr = PageRanker(0.85, web)
75
76 pr.improve_guess(100)
77 print pr.getPageRank()
1.2. PageRank-in-Python
1 #!/usr/bin/env python
2
3 from numpy import *
4
5 def pageRankGenerator(
6 At = [array((), int32)],
7 numLinks = array((), int32),
8 ln = array((), int32),
9 alpha = 0.85,
10 convergence = 0.01,
11 checkSteps = 10
12 ):
13 """
14 Compute an approximate page rank vector of N pages to within some convergence factor.
15 @param At a sparse square matrix with N rows. At[ii] contains the indices of pages jj linking to ii.
16 @param numLinks iNumLinks[ii] is the number of links going out from ii.
17 @param ln contains the indices of pages without links
18 @param alpha a value between 0 and 1. Determines the relative importance of "stochastic" links.
19 @param convergence a relative convergence criterion. smaller means better, but more expensive.
20 @param checkSteps check for convergence after so many steps
21 """
22
23 # the number of "pages"
24 N = len(At)
25
26 # the number of "pages without links"
27 M = ln.shape[0]
28
29 # initialize: single-precision should be good enough
30 iNew = ones((N,), float32) / N
31 iOld = ones((N,), float32) / N
32
33 done = False
34 while not done:
35
36 # normalize every now and then for numerical stability
37 iNew /= sum(iNew)
38
39 for step in range(checkSteps):
40
41 # swap arrays
42 iOld, iNew = iNew, iOld
43
44 # an element in the 1 x I vector.
45 # all elements are identical.
46 oneIv = (1 - alpha) * sum(iOld) / N
47
48 # an element of the A x I vector.
49 # all elements are identical.
50 oneAv = 0.0
51 if M > 0:
52 oneAv = alpha * sum(iOld.take(ln, axis = 0)) * M / N
53
54 # the elements of the H x I multiplication
55 ii = 0
56 while ii < N:
57 page = At[ii]
58 h = 0
59 if page.shape[0]:
60 h = alpha * dot(
61 iOld.take(page, axis = 0),
62 1. / numLinks.take(page, axis = 0)
63 )
64 iNew[ii] = h + oneAv + oneIv
65 ii += 1
66
67 diff = iNew - iOld
68 done = (sqrt(dot(diff, diff)) / N < convergence)
69
70 yield iNew
71
72
73 def transposeLinkMatrix(
74 outGoingLinks = [[]]
75 ):
76 """
77 Transpose the link matrix. The link matrix contains the pages each page points to.
78 But what we want is to know which pages point to a given page, while retaining information
79 about how many links each page contains (so store that in a separate array),
80 as well as which pages contain no links at all (leaf nodes).
81
82 @param outGoingLinks outGoingLinks[ii] contains the indices of pages pointed to by page ii
83 @return a tuple of (incomingLinks, numOutGoingLinks, leafNodes)
84 """
85
86 nPages = len(outGoingLinks)
87 # incomingLinks[ii] will contain the indices jj of the pages linking to page ii
88 incomingLinks = [[] for ii in range(nPages)]
89 # the number of links in each page
90 numLinks = zeros(nPages, int32)
91 # the indices of the leaf nodes
92 leafNodes = []
93
94 for ii in range(nPages):
95 if len(outGoingLinks[ii]) == 0:
96 leafNodes.append(ii)
97 else:
98 numLinks[ii] = len(outGoingLinks[ii])
99 # transpose the link matrix
100 for jj in outGoingLinks[ii]:
101 incomingLinks[jj].append(ii)
102
103 incomingLinks = [array(ii) for ii in incomingLinks]
104 numLinks = array(numLinks)
105 leafNodes = array(leafNodes)
106
107 return incomingLinks, numLinks, leafNodes
108
109
110 def pageRank(
111 linkMatrix = [[]],
112 alpha = 0.85,
113 convergence = 0.01,
114 checkSteps = 10
115 ):
116 """
117 Convenience wrap for the link matrix transpose and the generator.
118
119 @see pageRankGenerator for parameter description
120 """
121 incomingLinks, numLinks, leafNodes = transposeLinkMatrix(linkMatrix)
122
123 for gr in pageRankGenerator(incomingLinks, numLinks, leafNodes, alpha = alpha, convergence = convergence, checkSteps = checkSteps):
124 final = gr
125
126 return final