含有章节索引的 *PUG 文章通用模板 ::-- hoxide [2006-04-29 09:12:35]
1. 代理服务器测试
简述: 校园网上外网要 money, 所以离不开 proxy 了. 不过网上有 n 多 proxy, 哪个最快呢? 好像有很多现成的测速软件, 但是用起来都不顺手. 既然会写程序, 为什么不自己写一个小工具?
1.1. 代码
1 import urllib
2 from HTMLParser import HTMLParser
3 from string import letters
4 import time
5
6 import pprint
7
def parserhtmllist(htmldata):
    """Extract (server, port) pairs from a proxy-list HTML page.

    Only rows written as ``<tr class="cells">`` are inspected.  Within such
    a row every text chunk found inside a ``<td>`` is collected in document
    order.  A row contributes a pair only when its first chunk parses as an
    integer (the row index column); the second and third chunks are then
    returned as the pair.  Header rows and rows with fewer than three
    chunks are skipped.

    htmldata -- the HTML text of the listing page.

    Returns a list of 2-tuples of strings, in page order.
    """

    class MyHTMLParser(HTMLParser):
        """Small state machine that gathers the cell text of 'cells' rows.

        state is one of:
          'none'   -- outside any interesting row
          'cell'   -- inside a <tr class="cells">, between cells
          'celltd' -- inside a <td> of such a row
        """

        def __init__(self):
            # HTMLParser is an old-style class on Python 2, so call the
            # base initialiser explicitly instead of using super().
            HTMLParser.__init__(self)
            self.state = 'none'
            self.rows = []    # one list of text chunks per matched row
            self.depth = 0    # 1 while inside a <td> of a matched row

        def handle_starttag(self, tag, attrs):
            if tag == 'tr' and ('class', 'cells') in attrs:
                # A new data row starts: open a fresh chunk list for it.
                self.depth = 0
                self.state = 'cell'
                self.rows.append([])
            if self.state == 'cell' and tag == 'td':
                self.depth += 1
                self.state = 'celltd'

        def handle_endtag(self, tag):
            if self.state == 'celltd' and tag == 'td':
                self.depth -= 1
                if self.depth == 0:
                    self.state = 'cell'
            if self.state == 'cell' and tag == 'tr':
                self.state = 'none'

        def handle_data(self, data):
            if self.state == 'celltd' and self.depth >= 1:
                self.rows[-1].append(data)

        def getlist(self):
            """Return the (second, third) chunks of every numbered row."""
            pairs = []
            for row in self.rows:
                try:
                    int(row[0])
                    pairs.append((row[1], row[2]))
                except (ValueError, IndexError):
                    # ValueError: first chunk is not a number (header row).
                    # IndexError: the row has fewer than three chunks.
                    continue
            return pairs

    parser = MyHTMLParser()
    parser.feed(htmldata)
    parser.close()
    return parser.getlist()
47
def getproxylist(proxylisturl, testurls,
                 proxies={}, maxtime=20, debug=True):
    """Download a proxy listing and time each listed proxy.

    proxylisturl -- URL of the HTML page that lists the proxies; it is
                    parsed with parserhtmllist().
    testurls     -- URLs to open through every candidate proxy.
    proxies      -- proxy mapping used only to download the listing page.
                    The shared {} default is never mutated here; it is kept
                    (rather than None) because FancyURLopener treats None
                    as "use the environment's proxy settings".
    maxtime      -- value recorded in place of a timing when opening a test
                    URL raises IOError.  It is a penalty score only; no
                    socket timeout is set by this function.
    debug        -- when true, print a progress line per candidate.

    Returns a list of ((server, port), [seconds, ...]) tuples, with one
    timing per entry of testurls, in listing order.
    """
    page = urllib.FancyURLopener(proxies).open(proxylisturl)
    try:
        data = page.read()
    finally:
        page.close()

    servers = parserhtmllist(data)
    timings = []
    for server, port in servers:
        candidate = {'http': 'http://%s:%s' % (server, port)}
        # The test requests must go through the candidate proxy itself.
        # Building the opener from `proxies` would time every candidate
        # over the same route and make the ranking meaningless.
        opener = urllib.FancyURLopener(candidate)
        if debug:
            print('testing %s:%s' % (server, port))

        elapsed = []
        for url in testurls:
            start = time.time()
            try:
                filehandle = opener.open(url)
            except IOError:
                elapsed.append(maxtime)
            else:
                # Only the time needed to open the URL is measured; the
                # body is not read.
                elapsed.append(time.time() - start)
                filehandle.close()
        timings.append(elapsed)

    # list() keeps the result sortable in place even where zip() is lazy.
    return list(zip(servers, timings))
69
if __name__ == '__main__':
    # Rank the proxies listed on the page below by the total time they
    # needed to open the test URLs, fastest first.
    # For an offline run, point proxylisturl at a saved copy of the page
    # (for example 'a.html') instead.
    proxylisturl = ('http://www.haozs.net/proxyip/index.php?'
                    'act=list&port=&type=&country=China&page=1')
    testurls = ['http://www.google.com']

    ranked = sorted(getproxylist(proxylisturl, testurls),
                    key=lambda entry: sum(entry[1]))

    print('\nResult(sorted):')
    for (server, port), times in ranked:
        print('%s:%s\t%g' % (server, port, sum(times)))
1.2. 分析
Html的处理上有点麻烦, 用了HTMLParser, 相信熟悉Parser的同学们肯定都明白.