PyPageRankGuess -- 猜想实现 Google Page Rank ::-- ZoomQuiet [2006-12-19 09:51:41]

 CPUG联盟:: CPUG::门户plone BPUG SPUG ZPUG SpreadPython Python宣传

# 1. 算法實作

## 1.1. Algorithm in 126 Lines

```   1 # PageRank algorithm
2 # By Peter Bengtsson
3 #    http://www.peterbe.com/
4 #    mail@peterbe.com
5 #
6 # Requires the numarray module
7 # http://www.stsci.edu/resources/software_hardware/numarray
8
9 from numarray import *
10 import numarray.linear_algebra as la
11
12 def _sum_sequence(seq):
13     """ sums up a sequence """
16
17 class PageRanker:
18     def __init__(self, p, webmatrix):
19         assert p>=0 and p <= 1
20         self.p = float(p)
21         if type(webmatrix)in [type([]), type(())]:
22             webmatrix = array(webmatrix, Float)
23         assert webmatrix.shape==webmatrix.shape
24         self.webmatrix = webmatrix
25
26         # create the deltamatrix
27         imatrix = identity(webmatrix.shape, Float)
28         for i in range(webmatrix.shape):
29             imatrix[i] = imatrix[i]*sum(webmatrix[i,:])
30         deltamatrix = la.inverse(imatrix)
31         self.deltamatrix = deltamatrix
32
33         # create the fmatrix
34         self.fmatrix = ones(webmatrix.shape, Float)
35
36         self.sigma = webmatrix.shape
37
38         # calculate the Stochastic matrix
39         _f_normalized = (self.sigma**-1)*self.fmatrix
40         _randmatrix = (1-p)*_f_normalized
41
42         _linkedmatrix = p * matrixmultiply(deltamatrix, webmatrix)
43         M = _randmatrix + _linkedmatrix
44
45         self.stochasticmatrix = M
46
47         self.invariantmeasure = ones((1, webmatrix.shape), Float)
48
49
50     def improve_guess(self, times=1):
51         for i in range(times):
52             self._improve()
53
54     def _improve(self):
55         self.invariantmeasure = matrixmultiply(self.invariantmeasure, self.stochasticmatrix)
56
57     def get_invariant_measure(self):
58         return self.invariantmeasure
59
60     def getPageRank(self):
61         sum = _sum_sequence(self.invariantmeasure)
62         copy = self.invariantmeasure
63         for i in range(len(copy)):
64             copy[i] = copy[i]/sum
65         return copy
66
67 if __name__=='__main__':
68     # Example usage
69     web = ((0, 1, 0, 0),
70            (0, 0, 1, 0),
71            (0, 0, 0, 1),
72            (1, 0, 0, 0))
73
74     pr = PageRanker(0.85, web)
75
76     pr.improve_guess(100)
77     print pr.getPageRank()
```

## 1.2. PageRank-in-Python

```   1 #!/usr/bin/env python
2
3 from numpy import *
4
5 def pageRankGenerator(
6         At = [array((), int32)],
8         ln = array((), int32),
9         alpha = 0.85,
10         convergence = 0.01,
11         checkSteps = 10
12         ):
13         """
14         Compute an approximate page rank vector of N pages to within some convergence factor.
15         @param At a sparse square matrix with N rows. At[ii] contains the indices of pages jj linking to ii.
17         @param ln contains the indices of pages without links
18         @param alpha a value between 0 and 1. Determines the relative importance of "stochastic" links.
19         @param convergence a relative convergence criterion. smaller means better, but more expensive.
20         @param checkSteps check for convergence after so many steps
21         """
22
23         # the number of "pages"
24         N = len(At)
25
26         # the number of "pages without links"
27         M = ln.shape
28
29         # initialize: single-precision should be good enough
30         iNew = ones((N,), float32) / N
31         iOld = ones((N,), float32) / N
32
33         done = False
34         while not done:
35
36                 # normalize every now and then for numerical stability
37                 iNew /= sum(iNew)
38
39                 for step in range(checkSteps):
40
41                         # swap arrays
42                         iOld, iNew = iNew, iOld
43
44                         # an element in the 1 x I vector.
45                         # all elements are identical.
46                         oneIv = (1 - alpha) * sum(iOld) / N
47
48                         # an element of the A x I vector.
49                         # all elements are identical.
50                         oneAv = 0.0
51                         if M > 0:
52                                 oneAv = alpha * sum(iOld.take(ln, axis = 0)) * M / N
53
54                         # the elements of the H x I multiplication
55                         ii = 0
56                         while ii < N:
57                                 page = At[ii]
58                                 h = 0
59                                 if page.shape:
60                                         h = alpha * dot(
61                                                 iOld.take(page, axis = 0),
62                                                 1. / numLinks.take(page, axis = 0)
63                                                 )
64                                 iNew[ii] = h + oneAv + oneIv
65                                 ii += 1
66
67                 diff = iNew - iOld
68                 done = (sqrt(dot(diff, diff)) / N < convergence)
69
70                 yield iNew
71
72
75         ):
76         """
77         Transpose the link matrix. The link matrix contains the pages each page points to.
78         But what we want is to know which pages point to a given page, while retaining information
79         about how many links each page contains (so store that in a separate array),
80         as well as which pages contain no links at all (leaf nodes).
81
82         @param outGoingLinks outGoingLinks[ii] contains the indices of pages pointed to by page ii
84         """
85
87         # incomingLinks[ii] will contain the indices jj of the pages linking to page ii
88         incomingLinks = [[] for ii in range(nPages)]
89         # the number of links in each page
91         # the indices of the leaf nodes
92         leafNodes = []
93
94         for ii in range(nPages):
96                         leafNodes.append(ii)
97                 else:
99                         # transpose the link matrix
102
105         leafNodes = array(leafNodes)
106
108
109
110 def pageRank(
112         alpha = 0.85,
113         convergence = 0.01,
114         checkSteps = 10
115         ):
116         """
117         Convenience wrap for the link matrix transpose and the generator.
118
119         @see pageRankGenerator for parameter description
120         """