代码见 http://zspy.googlecode.com

张沈鹏 zsp007@gmail.com http://zsp.javaeye.com/

2008-1-23 16:42

1. PycURL

Pycurl http://pycurl.sourceforge.net/

libcurl 的 Python 接口,底层用 C 编写,比 urllib 快,功能也更强。支持限制最大重定向次数,可以防止陷入重定向循环(重定向陷阱)。适合用来写网络爬虫、抓取网页。

http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.

参考文献1,测试代码

   1 
   2 #像操作文件一样操作字符串,也可以from cStringIO import StringIO,性能应该会好一些
   3 import StringIO
   4 
   5 html = StringIO.StringIO()
   6 
   7 import pycurl
   8 c = pycurl.Curl()
   9 
  10 c.setopt(pycurl.URL, 'http://www.baidu.com')
  11 
  12 #写的回调
  13 c.setopt(pycurl.WRITEFUNCTION, html.write)
  14 
  15 c.setopt(pycurl.FOLLOWLOCATION, 1)
  16 
  17 #最大重定向次数,可以预防重定向陷阱
  18 c.setopt(pycurl.MAXREDIRS, 5)
  19 
  20 #访问,阻塞到访问结束
  21 c.perform()
  22 
  23 #打印出 200(HTTP状态码) http://www.baidu.com(生效的url)
  24 print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
  25 
  26 #输出百度首页的html
  27 #print html.getvalue()

然后看看多线程下载。http://pycurl.cvs.sourceforge.net/pycurl/pycurl/tests/ 里有很多例子,还可以参考 CurlMulti 的文档 http://pycurl.sourceforge.net/doc/curlmultiobject.html

我自己改写了一个:)

   1 
   2 #!/usr/bin/env python
   3 #coding=utf-8
   4 
   5 import threading
   6 import pycurl
   7 from cStringIO import StringIO
   8 
   9 class UrlOpen(threading.Thread):
  10     """异步下载网页"""
  11 
  12     def __init__(self):
  13         super(UrlOpen,self).__init__()
  14         self.opener = pycurl.CurlMulti()
  15         self.handle_list=[]
  16 
  17     def add(self,url,recall,writer=StringIO()):
  18         """
  19         参数:网址,回调函数,存放临时数据的对象
  20         """
  21         c = pycurl.Curl()
  22 
  23         #可以传给回调函数
  24         c.url=url
  25         c.content = writer
  26         c.recall = recall
  27         c.setopt(c.URL,url)
  28         c.setopt(c.WRITEFUNCTION,c.content.write)
  29 
  30         self.handle_list.append(c)
  31         self.opener.add_handle(c)
  32 
  33     def _remove(self,c):
  34         c.close()
  35         self.opener.remove_handle(c)
  36         self.handle_list.remove(c)
  37 
  38 
  39     def run(self):
  40         num_handle=len(self.handle_list)
  41         while 1:
  42             ret = self.opener.select(10.0)
  43             if ret == -1:  continue
  44             while 1:
  45                 num_handle_pre=num_handle
  46                 ret, num_handle =self.opener.perform()
  47                 #活动的连接数改变时
  48                 if num_handle!=num_handle_pre:
  49                     result=self.opener.info_read()
  50                     print result
  51                     for i in result[1]:
  52                         #成功
  53                         i.http_code = i.getinfo(i.HTTP_CODE)
  54                         self._remove(i)
  55                         i.recall(i)
  56                     for i in result[2]:
  57                         #失败,应该记录一下
  58                         self._remove(i)
  59 
  60                 if ret != pycurl.E_CALL_MULTI_PERFORM:
  61                     break
  62 
  63 _opener=None
  64 def urlopen(*arg,**key):
  65     global _opener
  66     if _opener is None:
  67         _opener=UrlOpen()
  68         _opener.add(*arg,**key)
  69         _opener.start()
  70     else:
  71         _opener.add(*arg,**key)
  72 
  73 def show(x):
  74     print x.content.getvalue()
  75 if __name__=="__main__":
  76     urlopen("http://www.baidu.com/",show)
  77     _opener.join()

又封装了一个异步打开网页的类和函数

   1 #coding=utf-8
   2 
   3 import threading
   4 from cStringIO import StringIO
   5 
   6 import pycurl
   7 """
   8 Asyn open url
   9 Author:zsp007@gmail.com
  10 2008-1-25 17:14
  11 """
  12 
  13 class UrlOpen(threading.Thread):
  14     """异步下载网页"""
  15 
  16     def __init__(self,):
  17         super(UrlOpen,self).__init__()
  18         self.opener = pycurl.CurlMulti()
  19         self.handle_list=[]
  20         self.waiting=[]
  21 
  22     def add(self,url,recall,catch=None,writer=StringIO()):
  23         """
  24         参数:网址,回调函数,存放临时数据的对象
  25         """
  26         if catch is None:
  27             def catch(curl,error_no,desp):
  28                 #print "Error:%s - %s"%(error_no,desp)
  29                 pass
  30 
  31         c = pycurl.Curl()
  32 
  33         #可以传给回调函数
  34         c.url=url
  35         c.content = writer
  36         c.recall = recall
  37         c.catch=catch
  38         c.setopt(c.URL,
  39             url.encode('utf-8') if type(url) is unicode else url
  40         )
  41         c.setopt(c.WRITEFUNCTION,c.content.write)
  42 
  43         self.waiting.append(c)
  44 
  45     def _add(self):
  46         waiting=self.waiting[:]
  47         self.waiting=[]
  48         for c in waiting:
  49             self.handle_list.append(c)
  50             self.opener.add_handle(c)
  51 
  52     def _remove(self,c):
  53         c.close()
  54         self.opener.remove_handle(c)
  55         self.handle_list.remove(c)
  56 
  57 
  58     def run(self):
  59         import select
  60         import time
  61         num_handle=0
  62         while 1:
  63             if self.handle_list:
  64                 ret = self.opener.select(1.0)
  65                 if ret >= 0:
  66                     while 1:
  67                         num_handle_pre=num_handle
  68                         ret, num_handle =self.opener.perform()
  69                         #活动的连接数改变时
  70                         if num_handle!=num_handle_pre:
  71                             result=self.opener.info_read()
  72                             for i in result[1]:
  73                                 #成功
  74                                 i.http_code = i.getinfo(i.HTTP_CODE)
  75                                 self._remove(i)
  76                                 i.recall(i)
  77                             for i in result[2]:
  78                                 #失败,应该记录一下,或回调失败函数
  79                                 #i为(<pycurl.Curl object at 0x00C04C80>, 6, 'Could not resolve host: www.msn.com (Domain name not found)')
  80                                 i[0].catch(*i)
  81                                 self._remove(i[0])
  82                         if ret != pycurl.E_CALL_MULTI_PERFORM:
  83                             break
  84             else:
  85                 time.sleep(1)
  86             self._add()
  87 
  88 _opener=None
  89 def urlopen(*arg,**key):
  90     global _opener
  91     if _opener is None:
  92         _opener=UrlOpen()
  93         _opener.start()
  94     _opener.add(*arg,**key)
  95 
  96 if __name__=="__main__":
  97     def show(x):
  98         print x.content.getvalue()
  99         print '--'*11
 100     urlopen("http://www.baidu.com/",show)
 101     urlopen("http://www.google.com/",show)
 102     urlopen("http://www.sougou.com/",show)
 103     urlopen("http://www.yodao.com/",show)
 104     urlopen("http://www.yahoo.com/",show)
 105     urlopen("http://www.msn.com/",show)
 106     _opener.join()

1.1. 相关文献

2. 反馈

Name Password4deL ;) :( X-( B-)
;) fluke   不错,代码很明晰,马上就看懂了。
curl是好东西。
2008-03-05 00:58:29
;) Simon   http://ford-9.nissov.net http://cadillac-8.nissov.net http://ford-18.nissov.net http://bmw-10.nissov.net http://ford-20.nissov.net http://ford-38.nissov.net http://chrysler-10.nissov.net http://bmw-14-436.nissov.net http://bmw-15-390.nissov.net http://chrysler-3.nissov.net http://ford-65.nissov.net http://jaguar-843.nissov.net http://ford-18-699.nissov.net http://ford-44-222.nissov.net http://bmw-15.nissov.net http://audi-4-448.nissov.net http://ford-44.nissov.net
http://jaguar-3-31.nissov.net
2008-03-19 05:59:36
plxab lvsxqj   rzgoesq sogaw kjal obys yknx hlrvzk lsvwipfn
2008-04-04 07:17:10
qlkruowx ywohift   zvcujtiq esdlwxajg yuqsh hgiaeot hincsrq ehzqgvxn tgcqwadno http://www.lakcnf.zrjqk.com
2008-04-04 07:17:52
rmcfpzsuj dpajmrv   mywo lcgbkhupv lqxfbevry vcqgkjfny injqcgbsd jicqnxtr stuhbj <A href="http://www.corqbthvd.axhm.com">tsxq ptlygvru</A>
2008-04-04 07:19:44
rmcfpzsuj dpajmrv   mywo lcgbkhupv lqxfbevry vcqgkjfny injqcgbsd jicqnxtr stuhbj <A href="http://www.corqbthvd.axhm.com">tsxq ptlygvru</A>
2008-04-04 07:19:55
zkrnyflgh loyikc   nyirlxqu yxpewc silvx gnsaokv fbunizd wurvn bhnzugy [URL=http://www.spenj.elawibz.com]hvotfed phzcb[/URL]
2008-04-04 07:20:47
korhvjfq mjypl   nlscv lixsgtym xdlr hxikf hbpfvwyec bryqfh qbait [URL]http://www.cahrew.ryvp.com[/URL] ljxdzfpru erxfvl
2008-04-04 07:22:19
;) juniper bank.   Nice Site!
http://google.com
2008-06-05 09:40:06
;) ip2-191   nice! a good page for codes. Thanks.
2008-07-02 11:30:39
X-( dtwmgxmouj   mnGGgL  <a href="http://piuyxtvqqcqr.com/">piuyxtvqqcqr</a>, [url=http://iqumlkxwkbot.com/]iqumlkxwkbot[/url], [link=http://siglrigweoja.com/]siglrigweoja[/link], http://fserhjortiaa.com/
2008-08-14 19:28:23
X-( dtwmgxmouj   mnGGgL  <a href="http://piuyxtvqqcqr.com/">piuyxtvqqcqr</a>, [url=http://iqumlkxwkbot.com/]iqumlkxwkbot[/url], [link=http://siglrigweoja.com/]siglrigweoja[/link], http://fserhjortiaa.com/
2008-08-14 19:29:04
;) Lawrence   http://sertraline.zuvexry.cn http://nuprin.zostura.cn http://cisapride.zenavo.cn http://mental-health-8.zostura.cn http://antibiotics.zenavo.cn http://trazolan-455.zyreloj.cn http://dental-8.zenavo.cn http://pain-13-382.zostura.cn http://plan-b-41.zuvexry.cn http://coumadin-596.zenavo.cn http://alcohol-2-137.zenavo.cn http://gyne-lotrimin-183.zipace.cn http://nursing-6.zostura.cn http://flomax.zipace.cn http://paxil.zuvexry.cn http://solian.zuvexry.cn http://womens-health-739.zyreloj.cn http://pain-38.zostura.cn
http://sustiva-443.zuvexry.cn
2008-11-04 21:41:58