PyPageRankGuess -- 猜想实现 Google Page Rank ::-- ZoomQuiet [2006-12-19 09:51:41]

CPUG联盟::

CPUG::门户plone

BPUG

SPUG

ZPUG

SpreadPython Python宣传

1. 算法實作

1.1. Algorithm in 126 Lines

   1 # PageRank algorithm
   2 # By Peter Bengtsson
   3 #    http://www.peterbe.com/
   4 #    [email protected]
   5 #
   6 # Requires the numarray module
   7 # http://www.stsci.edu/resources/software_hardware/numarray
   8 
   9 from numarray import *
  10 import numarray.linear_algebra as la
  11 
  12 def _sum_sequence(seq):
  13     """ sums up a sequence """
  14     def _add(x,y): return x+y
  15     return reduce(_add, seq, 0)
  16 
  17 class PageRanker:
  18     def __init__(self, p, webmatrix):
  19         assert p>=0 and p <= 1
  20         self.p = float(p)
  21         if type(webmatrix)in [type([]), type(())]:
  22             webmatrix = array(webmatrix, Float)
  23         assert webmatrix.shape[0]==webmatrix.shape[1]
  24         self.webmatrix = webmatrix
  25 
  26         # create the deltamatrix
  27         imatrix = identity(webmatrix.shape[0], Float)
  28         for i in range(webmatrix.shape[0]):
  29             imatrix[i] = imatrix[i]*sum(webmatrix[i,:])
  30         deltamatrix = la.inverse(imatrix)
  31         self.deltamatrix = deltamatrix
  32 
  33         # create the fmatrix
  34         self.fmatrix = ones(webmatrix.shape, Float)
  35 
  36         self.sigma = webmatrix.shape[0]
  37 
  38         # calculate the Stochastic matrix
  39         _f_normalized = (self.sigma**-1)*self.fmatrix
  40         _randmatrix = (1-p)*_f_normalized
  41 
  42         _linkedmatrix = p * matrixmultiply(deltamatrix, webmatrix)
  43         M = _randmatrix + _linkedmatrix
  44 
  45         self.stochasticmatrix = M
  46 
  47         self.invariantmeasure = ones((1, webmatrix.shape[0]), Float)
  48 
  49 
  50     def improve_guess(self, times=1):
  51         for i in range(times):
  52             self._improve()
  53 
  54     def _improve(self):
  55         self.invariantmeasure = matrixmultiply(self.invariantmeasure, self.stochasticmatrix)
  56 
  57     def get_invariant_measure(self):
  58         return self.invariantmeasure
  59 
  60     def getPageRank(self):
  61         sum = _sum_sequence(self.invariantmeasure[0])
  62         copy = self.invariantmeasure[0]
  63         for i in range(len(copy)):
  64             copy[i] = copy[i]/sum
  65         return copy
  66 
  67 if __name__=='__main__':
  68     # Example usage
  69     web = ((0, 1, 0, 0),
  70            (0, 0, 1, 0),
  71            (0, 0, 0, 1),
  72            (1, 0, 0, 0))
  73 
  74     pr = PageRanker(0.85, web)
  75 
  76     pr.improve_guess(100)
  77     print pr.getPageRank()

1.2. PageRank-in-Python

   1 #!/usr/bin/env python
   2 
   3 from numpy import *
   4 
   5 def pageRankGenerator(
   6         At = [array((), int32)],
   7         numLinks = array((), int32),
   8         ln = array((), int32),
   9         alpha = 0.85,
  10         convergence = 0.01,
  11         checkSteps = 10
  12         ):
  13         """
  14         Compute an approximate page rank vector of N pages to within some convergence factor.
  15         @param At a sparse square matrix with N rows. At[ii] contains the indices of pages jj linking to ii.
  16         @param numLinks iNumLinks[ii] is the number of links going out from ii.
  17         @param ln contains the indices of pages without links
  18         @param alpha a value between 0 and 1. Determines the relative importance of "stochastic" links.
  19         @param convergence a relative convergence criterion. smaller means better, but more expensive.
  20         @param checkSteps check for convergence after so many steps
  21         """
  22 
  23         # the number of "pages"
  24         N = len(At)
  25 
  26         # the number of "pages without links"
  27         M = ln.shape[0]
  28 
  29         # initialize: single-precision should be good enough
  30         iNew = ones((N,), float32) / N
  31         iOld = ones((N,), float32) / N
  32 
  33         done = False
  34         while not done:
  35 
  36                 # normalize every now and then for numerical stability
  37                 iNew /= sum(iNew)
  38 
  39                 for step in range(checkSteps):
  40 
  41                         # swap arrays
  42                         iOld, iNew = iNew, iOld
  43 
  44                         # an element in the 1 x I vector.
  45                         # all elements are identical.
  46                         oneIv = (1 - alpha) * sum(iOld) / N
  47 
  48                         # an element of the A x I vector.
  49                         # all elements are identical.
  50                         oneAv = 0.0
  51                         if M > 0:
  52                                 oneAv = alpha * sum(iOld.take(ln, axis = 0)) * M / N
  53 
  54                         # the elements of the H x I multiplication
  55                         ii = 0
  56                         while ii < N:
  57                                 page = At[ii]
  58                                 h = 0
  59                                 if page.shape[0]:
  60                                         h = alpha * dot(
  61                                                 iOld.take(page, axis = 0),
  62                                                 1. / numLinks.take(page, axis = 0)
  63                                                 )
  64                                 iNew[ii] = h + oneAv + oneIv
  65                                 ii += 1
  66 
  67                 diff = iNew - iOld
  68                 done = (sqrt(dot(diff, diff)) / N < convergence)
  69 
  70                 yield iNew
  71 
  72 
  73 def transposeLinkMatrix(
  74         outGoingLinks = [[]]
  75         ):
  76         """
  77         Transpose the link matrix. The link matrix contains the pages each page points to.
  78         But what we want is to know which pages point to a given page, while retaining information
  79         about how many links each page contains (so store that in a separate array),
  80         as well as which pages contain no links at all (leaf nodes).
  81 
  82         @param outGoingLinks outGoingLinks[ii] contains the indices of pages pointed to by page ii
  83         @return a tuple of (incomingLinks, numOutGoingLinks, leafNodes)
  84         """
  85 
  86         nPages = len(outGoingLinks)
  87         # incomingLinks[ii] will contain the indices jj of the pages linking to page ii
  88         incomingLinks = [[] for ii in range(nPages)]
  89         # the number of links in each page
  90         numLinks = zeros(nPages, int32)
  91         # the indices of the leaf nodes
  92         leafNodes = []
  93 
  94         for ii in range(nPages):
  95                 if len(outGoingLinks[ii]) == 0:
  96                         leafNodes.append(ii)
  97                 else:
  98                         numLinks[ii] = len(outGoingLinks[ii])
  99                         # transpose the link matrix
 100                         for jj in outGoingLinks[ii]:
 101                                 incomingLinks[jj].append(ii)
 102 
 103         incomingLinks = [array(ii) for ii in incomingLinks]
 104         numLinks = array(numLinks)
 105         leafNodes = array(leafNodes)
 106 
 107         return incomingLinks, numLinks, leafNodes
 108 
 109 
 110 def pageRank(
 111         linkMatrix = [[]],
 112         alpha = 0.85,
 113         convergence = 0.01,
 114         checkSteps = 10
 115         ):
 116         """
 117         Convenience wrap for the link matrix transpose and the generator.
 118 
 119         @see pageRankGenerator for parameter description
 120         """
 121         incomingLinks, numLinks, leafNodes = transposeLinkMatrix(linkMatrix)
 122 
 123         for gr in pageRankGenerator(incomingLinks, numLinks, leafNodes, alpha = alpha, convergence = convergence, checkSteps = checkSteps):
 124                 final = gr
 125 
 126         return final

2. 反馈