代码见 http://zspy.googlecode.com
张沈鹏 zsp007@gmail.com http://zsp.javaeye.com/
2008-1-23 16:42
1. PycURL
Pycurl http://pycurl.sourceforge.net/
libcurl 的 Python 接口,用 C 编写,比 urllib 快,功能也更强。支持设置最大重定向深度,可以防止陷入循环重定向的陷阱。适合用来编写网络爬虫、抓取网页。
从 http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.
参考文献1,测试代码
1
# StringIO lets a string be used like a file; ``from cStringIO import
# StringIO`` is an alternative that should perform better.
import StringIO

import pycurl

# In-memory buffer that receives the response body.
html = StringIO.StringIO()

c = pycurl.Curl()
try:
    c.setopt(pycurl.URL, 'http://www.baidu.com')

    # Write callback: libcurl hands each received chunk to html.write.
    c.setopt(pycurl.WRITEFUNCTION, html.write)

    c.setopt(pycurl.FOLLOWLOCATION, 1)

    # Maximum number of redirects to follow; guards against redirect traps.
    c.setopt(pycurl.MAXREDIRS, 5)

    # Perform the request; blocks until the transfer has finished.
    c.perform()

    # Prints e.g. "200 http://www.baidu.com" (HTTP status, effective URL).
    print("%s %s" % (c.getinfo(pycurl.HTTP_CODE),
                     c.getinfo(pycurl.EFFECTIVE_URL)))
finally:
    # Release the handle even when perform() raises.
    c.close()

# Uncomment to dump the HTML of the Baidu home page.
# print(html.getvalue())
然后看看多线程。http://pycurl.cvs.sourceforge.net/pycurl/pycurl/tests/ 里有很多例子,还可以参考 http://pycurl.sourceforge.net/doc/curlmultiobject.html
我自己改写了一个:)
1
2 #!/usr/bin/env python
3 #coding=utf-8
4
5 import threading
6 import pycurl
7 from cStringIO import StringIO
8
class UrlOpen(threading.Thread):
    """Download web pages asynchronously on a background thread.

    Every request gets its own ``pycurl.Curl`` handle attached to one shared
    ``pycurl.CurlMulti``. ``run()`` drives the transfers and calls the
    request's callback once the transfer has succeeded.

    NOTE(review): ``add()`` touches the multi handle directly, so calling it
    after ``start()`` happens on a different thread than ``run()`` -- confirm
    this is acceptable for the pycurl version in use.
    """

    def __init__(self):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        # Handles currently attached to ``self.opener``.
        self.handle_list = []

    def add(self, url, recall, writer=None):
        """Attach a download of ``url`` to the multi handle.

        Args:
            url: address to fetch.
            recall: callable invoked with the finished handle on success.
            writer: object whose ``write`` method receives the response body.
                A fresh ``StringIO`` is created for each request when omitted.
        """
        if writer is None:
            # Build the buffer per call; a ``StringIO()`` default in the
            # signature would be one object shared by every request.
            writer = StringIO()

        c = pycurl.Curl()

        # Stored on the handle so the callback can reach them.
        c.url = url
        c.content = writer
        c.recall = recall
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, c.content.write)

        self.handle_list.append(c)
        self.opener.add_handle(c)

    def _remove(self, c):
        """Detach ``c`` from the multi handle, close it and forget it."""
        # Detach first, then close, so the multi handle never holds a
        # handle that has already been closed.
        self.opener.remove_handle(c)
        c.close()
        self.handle_list.remove(c)

    def run(self):
        """Drive all transfers; this loop never returns."""
        while 1:
            ret = self.opener.select(10.0)
            if ret == -1:
                continue
            while 1:
                ret, _running = self.opener.perform()
                # Collect completions after every perform() instead of only
                # when the running count changes; otherwise a transfer that
                # finishes within the perform() that started it is missed.
                result = self.opener.info_read()
                for handle in result[1]:
                    # Succeeded: record the status before the handle closes.
                    handle.http_code = handle.getinfo(handle.HTTP_CODE)
                    self._remove(handle)
                    handle.recall(handle)
                for failure in result[2]:
                    # Failed: each entry is a (handle, errno, message) tuple,
                    # so only the handle itself is passed to _remove().
                    self._remove(failure[0])

                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
62
# Shared worker thread, created on the first urlopen() call.
_opener = None


def urlopen(*arg, **key):
    """Queue a download on the shared ``UrlOpen`` worker.

    All arguments are forwarded unchanged to ``UrlOpen.add``. The first call
    creates the worker and starts it once its first request is attached.
    """
    global _opener
    first_call = _opener is None
    if first_call:
        _opener = UrlOpen()
    _opener.add(*arg, **key)
    if first_call:
        # Start the thread only after the first request has been added.
        _opener.start()
72
def show(x):
    """Print the body collected in ``x.content`` for a finished handle."""
    body = x.content.getvalue()
    print(body)
if __name__ == "__main__":
    # Demo: fetch one page, print it from the callback, then wait on the
    # worker thread.
    urlopen("http://www.baidu.com/", show)
    _opener.join()
又封装了一个异步打开网页的类和函数
1 #coding=utf-8
2
3 import threading
4 from cStringIO import StringIO
5
6 import pycurl
7 """
8 Asyn open url
9 Author:zsp007@gmail.com
10 2008-1-25 17:14
11 """
12
class UrlOpen(threading.Thread):
    """Download web pages asynchronously on a background thread.

    ``add()`` may be called from any thread: it only prepares a
    ``pycurl.Curl`` handle and puts it on ``self.waiting``. The worker thread
    (``run()``) moves waiting handles onto its ``pycurl.CurlMulti``, drives
    the transfers and calls the per-request callbacks.

    Written for Python 2 (uses ``unicode`` and ``cStringIO``).
    """

    def __init__(self,):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        # Handles currently attached to ``self.opener``.
        self.handle_list = []
        # Handles prepared by add() that run() has not attached yet.
        self.waiting = []

    def add(self, url, recall, catch=None, writer=None):
        """Prepare a download of ``url`` and queue it for the worker.

        Args:
            url: address to fetch; ``unicode`` values are encoded as UTF-8.
            recall: callable invoked with the finished handle on success.
            catch: callable invoked as ``catch(curl, error_no, desp)`` when
                the transfer fails. Defaults to ignoring the failure.
            writer: object whose ``write`` method receives the response body.
                A fresh ``StringIO`` is created for each request when omitted.
        """
        if catch is None:
            def catch(curl, error_no, desp):
                # Default: ignore failures; pass ``catch`` to log or handle.
                pass

        if writer is None:
            # Build the buffer per call; a ``StringIO()`` default in the
            # signature would be one object shared by every request.
            writer = StringIO()

        c = pycurl.Curl()

        # Stored on the handle so the callbacks can reach them.
        c.url = url
        c.content = writer
        c.recall = recall
        c.catch = catch
        c.setopt(c.URL,
                 url.encode('utf-8') if isinstance(url, unicode) else url)
        c.setopt(c.WRITEFUNCTION, c.content.write)

        self.waiting.append(c)

    def _add(self):
        """Attach every queued handle to the multi handle."""
        # Drain with pop() rather than snapshotting and resetting the list:
        # a handle appended by another thread between the snapshot and the
        # reset would be dropped.
        while True:
            try:
                c = self.waiting.pop(0)
            except IndexError:
                break
            self.handle_list.append(c)
            self.opener.add_handle(c)

    def _remove(self, c):
        """Detach ``c`` from the multi handle, close it and forget it."""
        # Detach first, then close, so the multi handle never holds a
        # handle that has already been closed.
        self.opener.remove_handle(c)
        c.close()
        self.handle_list.remove(c)

    def run(self):
        """Drive transfers and attach newly queued handles; never returns."""
        import time

        while 1:
            # NOTE(review): the listing lost its indentation; the ``else``
            # below is assumed to pair with ``if self.handle_list`` and
            # ``self._add()`` to run on every pass -- confirm.
            if self.handle_list:
                ret = self.opener.select(1.0)
                if ret >= 0:
                    while 1:
                        ret, _running = self.opener.perform()
                        # Collect completions after every perform() instead
                        # of only when the running count changes; a transfer
                        # that finishes within the perform() that started it
                        # leaves the count unchanged and would be missed.
                        result = self.opener.info_read()
                        for handle in result[1]:
                            # Succeeded: record the status before closing.
                            handle.http_code = handle.getinfo(handle.HTTP_CODE)
                            self._remove(handle)
                            handle.recall(handle)
                        for failure in result[2]:
                            # Failed: entry is (handle, errno, message), e.g.
                            # (<pycurl.Curl object>, 6,
                            #  'Could not resolve host: ...').
                            failure[0].catch(*failure)
                            self._remove(failure[0])
                        if ret != pycurl.E_CALL_MULTI_PERFORM:
                            break
            else:
                # Nothing in flight: wait before checking the queue again.
                time.sleep(1)
            self._add()
87
# Shared worker thread, created on the first urlopen() call.
_opener = None


def urlopen(*arg, **key):
    """Queue a download on the shared ``UrlOpen`` worker.

    The first call creates and starts the worker. Every call forwards its
    arguments unchanged to ``UrlOpen.add``.
    """
    global _opener
    worker = _opener
    if worker is None:
        worker = UrlOpen()
        _opener = worker
        worker.start()
    worker.add(*arg, **key)
95
if __name__ == "__main__":
    def show(x):
        """Print the downloaded body followed by a separator line."""
        print(x.content.getvalue())
        print('--' * 11)

    # Demo: queue several pages, then wait on the worker thread.
    demo_urls = (
        "http://www.baidu.com/",
        "http://www.google.com/",
        "http://www.sougou.com/",
        "http://www.yodao.com/",
        "http://www.yahoo.com/",
        "http://www.msn.com/",
    )
    for page in demo_urls:
        urlopen(page, show)
    _opener.join()
1.1. 相关文献
PycURL简单学习 http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx
python中的pycurl模块学习 https://forum.eviloctal.com/read.php?tid=27337
2. 反馈
| fluke | 不错,代码很明晰,马上就看懂了。 curl是好东西。 |
2008-03-05 00:58:29 | ||
| Simon | http://ford-9.nissov.net http://cadillac-8.nissov.net http://ford-18.nissov.net http://bmw-10.nissov.net http://ford-20.nissov.net http://ford-38.nissov.net http://chrysler-10.nissov.net http://bmw-14-436.nissov.net http://bmw-15-390.nissov.net http://chrysler-3.nissov.net http://ford-65.nissov.net http://jaguar-843.nissov.net http://ford-18-699.nissov.net http://ford-44-222.nissov.net http://bmw-15.nissov.net http://audi-4-448.nissov.net http://ford-44.nissov.net http://jaguar-3-31.nissov.net |
2008-03-19 05:59:36 | ||
| plxab lvsxqj | rzgoesq sogaw kjal obys yknx hlrvzk lsvwipfn |
2008-04-04 07:17:10 | ||
| qlkruowx ywohift | zvcujtiq esdlwxajg yuqsh hgiaeot hincsrq ehzqgvxn tgcqwadno http://www.lakcnf.zrjqk.com |
2008-04-04 07:17:52 | ||
| rmcfpzsuj dpajmrv | mywo lcgbkhupv lqxfbevry vcqgkjfny injqcgbsd jicqnxtr stuhbj <A href="http://www.corqbthvd.axhm.com">tsxq ptlygvru</A> |
2008-04-04 07:19:44 | ||
| rmcfpzsuj dpajmrv | mywo lcgbkhupv lqxfbevry vcqgkjfny injqcgbsd jicqnxtr stuhbj <A href="http://www.corqbthvd.axhm.com">tsxq ptlygvru</A> |
2008-04-04 07:19:55 | ||
| zkrnyflgh loyikc | nyirlxqu yxpewc silvx gnsaokv fbunizd wurvn bhnzugy [URL=http://www.spenj.elawibz.com]hvotfed phzcb[/URL] |
2008-04-04 07:20:47 | ||
| korhvjfq mjypl | nlscv lixsgtym xdlr hxikf hbpfvwyec bryqfh qbait [URL]http://www.cahrew.ryvp.com[/URL] ljxdzfpru erxfvl |
2008-04-04 07:22:19 | ||
| juniper bank. | Nice Site! http://google.com |
2008-06-05 09:40:06 | ||
| ip2-191 | nice! a good page for codes. Thanks. |
2008-07-02 11:30:39 | ||
| dtwmgxmouj | mnGGgL <a href="http://piuyxtvqqcqr.com/">piuyxtvqqcqr</a>, [url=http://iqumlkxwkbot.com/]iqumlkxwkbot[/url], [link=http://siglrigweoja.com/]siglrigweoja[/link], http://fserhjortiaa.com/ |
2008-08-14 19:28:23 | ||
| dtwmgxmouj | mnGGgL <a href="http://piuyxtvqqcqr.com/">piuyxtvqqcqr</a>, [url=http://iqumlkxwkbot.com/]iqumlkxwkbot[/url], [link=http://siglrigweoja.com/]siglrigweoja[/link], http://fserhjortiaa.com/ |
2008-08-14 19:29:04 | ||
| Lawrence | http://sertraline.zuvexry.cn http://nuprin.zostura.cn http://cisapride.zenavo.cn http://mental-health-8.zostura.cn http://antibiotics.zenavo.cn http://trazolan-455.zyreloj.cn http://dental-8.zenavo.cn http://pain-13-382.zostura.cn http://plan-b-41.zuvexry.cn http://coumadin-596.zenavo.cn http://alcohol-2-137.zenavo.cn http://gyne-lotrimin-183.zipace.cn http://nursing-6.zostura.cn http://flomax.zipace.cn http://paxil.zuvexry.cn http://solian.zuvexry.cn http://womens-health-739.zyreloj.cn http://pain-38.zostura.cn http://sustiva-443.zuvexry.cn |
2008-11-04 21:41:58 | ||