1. 起点爬虫

朱丹 <[email protected]>           hide details    3:42 pm (9 minutes ago)  
        reply-to                [email protected]       
        to              [email protected]       
        date            Aug 6, 2007 3:42 PM      
        subject         [CPyUG:30059] 给爱看小说的朋友_起点小说爬虫    
        mailed-by               googlegroups.com

最近闹书荒,不知道看什么书好. 起点上推荐的、排行榜上靠前的烂书真不少,不能用那个来判定. 而且下载的时候麻烦死了,要点好多下;我下的是txt格式(手机上用)的,经常下不下来. 好不容易下下来了,还贼难看... 下下来的文件名还是一堆id数字,还要自己去修改书名.....麻烦.. 干脆用python写了一个程序,下的那叫一个爽.. 稀里哗啦下了200M的书,慢慢看了.. 这个爬虫也是一顺的就写完了,没做什么修改,也懒得去改了,能用就行. 使用的时候需要提供你要下载的页面的url,比如 http://www.cmfu.com ,程序会找到这个页面上所有的书来下载.

   1 #@+leo-ver=4-thin-encoding=gb2312,.
   2 #@+node:BIGZHU.20070731160918:@thin d:/bigzhu/python/python_project/get_cmfu.py
   3 #@+at
   4 #@nonl
   5 # 起点小说爬虫
   6 #@-at
   7 #@@c
   8 #@@language python
   9 #@+others
  10 #@+node:BIGZHU.20070731161308:import
  11 import httplib,urllib2,urllib,cookielib,re,threading
  12 import os
  13 #@nonl
  14 #@-node:BIGZHU.20070731161308:import
  15 #@+node:BIGZHU.20070731160928:getCookie
  16 def getCookie():
  17     cj = cookielib.CookieJar()#建立Cookie实例
  18     opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))#建立opener与Cookie关联
  19     return opener
  20 #@-node:BIGZHU.20070731160928:getCookie
  21 #@-others
  22 #@<<getBookIdList>>
  23 #@+node:BIGZHU.20070731160918.1:<<getBookIdList>>
  24 def getBookIdList(opener,urlList):
  25 
  26     BookIdList = []
  27     for i in urlList:
  28         url=i
  29         print url
  30         request = urllib2.Request(url)
  31         cmfu = opener.open(request).read()
  32         #cmfuURL = re.findall("<a href='showbook.asp\?bl_id=\d{1 ,}'",cmfu)
  33         #BookIdListTemp = [re.sub("<a href='showbook.asp\?bl_id=",'',k) for k in cmfuURL]
  34         #BookIdListTemp = [re.sub("'",'',k) for k in BookIdListTemp]
  35         #起点的代码太不规范了,想一个更广泛性的匹配正则表达式
  36         """
  37         cmfuURL = re.findall("showbook.asp\?bl_id=\d{1,}",cmfu)
  38         BookIdListTemp = [re.sub("showbook.asp\?bl_id=",'',k) for k in cmfuURL]
  39         """
  40         #更大众化一些
  41         cmfuURL = re.findall("bl_id=\d{1,}",cmfu)
  42         BookIdListTemp = [re.sub("bl_id=",'',k) for k in cmfuURL]
  43         #BookIdListTemp = [ re.sub("'",'',k) for k in BookIdListTemp]
  44         bookCount = len(BookIdList)
  45         for listTemp in BookIdListTemp:
  46             #检查该bookid是否在BookIdList中已有
  47             if listTemp in BookIdList:
  48                 pass
  49             else:
  50                 BookIdList.extend([listTemp])#加进去
  51         print "取得书本数目:%i"%(len(BookIdList)-bookCount)
  52     print "合计取得下载书本:%i"%len(BookIdList)
  53     return BookIdList
  54 
  55 #@-node:BIGZHU.20070731160918.1:<<getBookIdList>>
  56 #@nl
  57 #@<<getBookName>>
  58 #@+node:BIGZHU.20070731164705:<<getBookName>>
  59 def getBookName(opener,bookId=''):
  60     if bookId == '':
  61         print "传入BookIdList是空的"
  62     bookURL = 'http://www.cmfu.com/readbook.asp?bl_id=%s'%bookId
  63     request = urllib2.Request(bookURL)
  64     bookPage = opener.open(request).read()
  65     opener.close()
  66     bookname =  re.findall('bookname=\S{1,}',bookPage)
  67 
  68     bookname = [re.sub("bookname=",'',k) for k in bookname]
  69     bookname = [re.sub('"','',k) for k in bookname][0]
  70 
  71     return bookname
  72 
  73 #@-node:BIGZHU.20070731164705:<<getBookName>>
  74 #@nl
  75 #@<<getTextFile>>
  76 #@+node:BIGZHU.20070731171721:<<getTextFile>>
  77 def getTextFile(opener,bookId):
  78         bookName = getBookName(opener,bookId)
  79         #判断文件是否已经存在
  80         if os.path.isfile(os.getcwd()+"\\起点\\%s.txt"%bookName):
  81             print "%s 已经存在"%bookName
  82         else:
  83             url = 'http://download.cmfu.com/pda/%s.txt'%bookId
  84             try:
  85                 bookData = opener.open(url).read()
  86             except :
  87                 print "2 %s"%bookName
  88                 try:
  89                     bookData = opener.open(url).read()
  90                 except :
  91                     print "last try %s"%bookName
  92                     try:
  93                         bookData = opener.open(url).read()
  94                     except :
  95                         print "end  try %s"%bookName
  96 
  97             opener.close()
  98 
  99             f=open(os.getcwd()+"\\起点\\%s.txt"%bookName,"wb")
 100             f.write(bookData)
 101             f.close()
 102             print 'get book %s 完毕'%bookName
 103 #@-node:BIGZHU.20070731171721:<<getTextFile>>
 104 #@nl
 105 #@<<class runGetFile>>
 106 #@+node:BIGZHU.20070801172939:<<class runGetFile>>
 107 class runGetFile(threading.Thread):
 108     def __init__(self,bookId):
 109         threading.Thread.__init__(self)
 110         self.bookId = bookId
 111         #self.opener = opener
 112     def run(self):
 113         opener = getCookie()
 114         getTextFile(opener,self.bookId)
 115 #@nonl
 116 #@-node:BIGZHU.20070801172939:<<class runGetFile>>
 117 #@nl
 118 #@<<class ProcessURL>>
 119 #@+node:BIGZHU.20070802171013:<<class ProcessURL>>
 120 class ProcessURL:
 121     """对新输入url,save 到ini中
 122     对已有url,忽视
 123     每次使用,自动读取ini的url,提供使用"""
 124     def __init__(self):
 125         pass
 126     #@    <<saveURL>>
 127     #@+node:BIGZHU.20070802171013.1:<<saveURL>>
 128     def saveURL(self,urlList=[]):
 129         '''存储新的url到URL.ini中'''
 130 
 131 
 132         try:
 133             f=open(os.getcwd()+"\\起点\\URL.ini","wb")#追加内容
 134         except IOError:
 135             print "文件打开错误"
 136             #格式化成字符串
 137         s_urlList = ";".join(urlList)
 138         f.write(s_urlList)
 139         f.close()
 140     #@-node:BIGZHU.20070802171013.1:<<saveURL>>
 141     #@nl
 142     #@    <<getURLIni>>
 143     #@+node:BIGZHU.20070802171013.2:<<getURLIni>>
 144     def getURLIni(self):
 145         """读取 URL.ini中的url
 146         返回一个URL list"""
 147          #判断目录是否存在
 148         if os.path.exists (os.getcwd()+"\\起点"):
 149             pass
 150         else:
 151             print "创建目录 \起点"
 152             os.mkdir("起点")
 153 
 154         iniData=''
 155         if os.path.isfile(os.getcwd ()+"\\起点\\URL.ini"):
 156             f=open(os.getcwd()+"\\起点\\URL.ini","rb")
 157             iniData = f.read()
 158             f.close()
 159         else:
 160             print "URL.txt不存在,创建之"
 161             f=open(os.getcwd()+"\\起点\\URL.ini","wb")
 162             #iniData = f.read()
 163             f.close()
 164         return iniData.split(";")#格式化成list   
 165     #@-node:BIGZHU.20070802171013.2:<<getURLIni>>
 166     #@nl
 167 
 168 
 169 
 170 
 171 #@-node:BIGZHU.20070802171013:<<class ProcessURL>>
 172 #@nl
 173 #@<<main>>
 174 #@+node:BIGZHU.20070731164705.1:<<main>>
 175 if __name__ == '__main__':
 176     opener = getCookie()
 177     #urlList =["http://www.cmfu.com/index.asp"," http://www.cmfu.com/listbookqb.asp?pageid=2007-8-1%2012:26&status=down","http://www.cmfu.com/listbookqb.asp?pageid=2007-7-31%2023:03&status=down ","http://www.cmfu.com/index_wxxx.asp"]
 178     #存放和读取url
 179     urlType = ProcessURL()
 180     urlList = urlType.getURLIni()
 181     saveIni = 0 # 标识是否有url 更新
 182     while True:
 183         url = raw_input("要截取的起点的某个页面:  ")
 184         if url=='':
 185             break
 186         if url in urlList:
 187             print "%s 已有,忽视之"%url
 188         else:
 189             urlList.extend([url])
 190             print "%s 是新的,添加之"%url
 191             saveIni =1
 192     #url = 'http://www.cmfu.com/index.asp'
 193 
 194 
 195     bookIdList=getBookIdList(opener,urlList)
 196 
 197 
 198     for i in bookIdList:
 199         thread = runGetFile(i)
 200         thread.start()
 201     #存储到ini中
 202     if saveIni == 1:
 203         urlType.saveURL(urlList)
 204 #@-node:BIGZHU.20070731164705.1:<<main>>
 205 #@nl
 206 #@nonl
 207 #@-node:BIGZHU.20070731160918:@thin d:/bigzhu/python/python_project/get_cmfu.py
 208 #@-leo

1. 起点爬虫

1.1. 反馈